Compare commits
343 Commits
v0.5.0.pos
...
v0.5.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4cf256ae7f | ||
|
|
64fdc08c72 | ||
|
|
4ef95b0f06 | ||
|
|
eaec4b9153 | ||
|
|
a63a4c6341 | ||
|
|
c8fd97f26d | ||
|
|
94b82e8c18 | ||
|
|
6ae1597ddf | ||
|
|
22e79ee8f3 | ||
|
|
de19916314 | ||
|
|
69672f116c | ||
|
|
44874a0bf9 | ||
|
|
b47008b4d2 | ||
|
|
9bfece89fd | ||
|
|
32c9d7f765 | ||
|
|
ccb20db8bd | ||
|
|
a754dc2cb9 | ||
|
|
61e85dbad8 | ||
|
|
dbfe254eda | ||
|
|
73030b7dae | ||
|
|
ccd3c04571 | ||
|
|
9dad5cc859 | ||
|
|
6ef3bf912c | ||
|
|
540c0368b1 | ||
|
|
fb6af8bc08 | ||
|
|
eeceadaecc | ||
|
|
babf52dade | ||
|
|
9da4aad44b | ||
|
|
41708e5034 | ||
|
|
d80aef3776 | ||
|
|
e1684a766a | ||
|
|
a27f87da34 | ||
|
|
16ff6bd58c | ||
|
|
f8f9ff57ee | ||
|
|
6bc9710f6e | ||
|
|
111fc6e7ec | ||
|
|
75f64d8b94 | ||
|
|
21b2dcedab | ||
|
|
07b35af86d | ||
|
|
bb1a784b05 | ||
|
|
d719ba24c5 | ||
|
|
aa48e502fb | ||
|
|
4dbebd03cc | ||
|
|
b75bce1008 | ||
|
|
b039cbbce3 | ||
|
|
f9d25c2519 | ||
|
|
024ad87cdc | ||
|
|
aea19f0989 | ||
|
|
f7160d946a | ||
|
|
6047187cd8 | ||
|
|
b6c16cf8ff | ||
|
|
d26a8b3f1f | ||
|
|
d59eb98489 | ||
|
|
adf32e0a0f | ||
|
|
2b0fb53481 | ||
|
|
d6ab528997 | ||
|
|
7ed6a4f0e1 | ||
|
|
a4feba929b | ||
|
|
2d23b42d92 | ||
|
|
1df43de9bb | ||
|
|
52b7fcb35a | ||
|
|
b675069d74 | ||
|
|
55f692b46e | ||
|
|
8a1415cf77 | ||
|
|
546b101fa0 | ||
|
|
3963a5335b | ||
|
|
c4774eb841 | ||
|
|
fc17110bbe | ||
|
|
439c84581a | ||
|
|
99ded1e1c4 | ||
|
|
997df46a32 | ||
|
|
ae151d73be | ||
|
|
44cc76610d | ||
|
|
b422d4961a | ||
|
|
c38eba3046 | ||
|
|
e72ae80b06 | ||
|
|
8a924d2248 | ||
|
|
5ed3505d82 | ||
|
|
da78caecfa | ||
|
|
2416b26e11 | ||
|
|
d3a245138a | ||
|
|
673dd4cae9 | ||
|
|
4d6ada947c | ||
|
|
a0550cbc80 | ||
|
|
08c5bdecae | ||
|
|
5d5b4c5fe5 | ||
|
|
70c232f85a | ||
|
|
a3c9435d93 | ||
|
|
4f0e0ea131 | ||
|
|
ddc369fba1 | ||
|
|
185ad31f37 | ||
|
|
543aa48573 | ||
|
|
f7a8fa39d8 | ||
|
|
717f4bcea0 | ||
|
|
16620f439d | ||
|
|
3b08fe2b13 | ||
|
|
abfe705a02 | ||
|
|
333306a252 | ||
|
|
6206dcb29e | ||
|
|
9389380015 | ||
|
|
175c43eca4 | ||
|
|
bc96d5c330 | ||
|
|
f0250620dd | ||
|
|
2de490d60f | ||
|
|
79d406e918 | ||
|
|
abad5746a7 | ||
|
|
e58294ddf2 | ||
|
|
f1e15da6fe | ||
|
|
0097bb1829 | ||
|
|
ea4b570483 | ||
|
|
a41357e941 | ||
|
|
ae96ef8fbd | ||
|
|
69ec3ca14c | ||
|
|
81d7a50f24 | ||
|
|
27902d42be | ||
|
|
56b325e977 | ||
|
|
3dd507083f | ||
|
|
0ed646b7aa | ||
|
|
1dab9bc8a9 | ||
|
|
3de6e6a30e | ||
|
|
966fe72141 | ||
|
|
62963d129e | ||
|
|
d9e98f42e4 | ||
|
|
3c6325f0fc | ||
|
|
47f0954af0 | ||
|
|
7cd2ebb025 | ||
|
|
f1c78138aa | ||
|
|
3a86b54fb0 | ||
|
|
f666207161 | ||
|
|
d830656a97 | ||
|
|
d18bab3587 | ||
|
|
9831aec49f | ||
|
|
482045ee77 | ||
|
|
9d6a8daa87 | ||
|
|
ee93f4f92a | ||
|
|
7c008c51a9 | ||
|
|
4d26d806e1 | ||
|
|
c5832d2ae9 | ||
|
|
15aba081f3 | ||
|
|
31354e563f | ||
|
|
98d6682cd1 | ||
|
|
2c37540aa6 | ||
|
|
3476ed0809 | ||
|
|
54600709b6 | ||
|
|
e373853e12 | ||
|
|
c87ebc3ef9 | ||
|
|
c4059ea54f | ||
|
|
8e0817c262 | ||
|
|
83bdcb6ac3 | ||
|
|
12a59959ed | ||
|
|
dec6fc6f3b | ||
|
|
8893130b63 | ||
|
|
bb60326836 | ||
|
|
4050d646e5 | ||
|
|
d76084c12f | ||
|
|
80ca1e6a3a | ||
|
|
614aa51203 | ||
|
|
af9ad46fca | ||
|
|
7836fdcc11 | ||
|
|
deacb7ec44 | ||
|
|
f5e73c9f1b | ||
|
|
c6c240aa0a | ||
|
|
2be6955a3f | ||
|
|
9d47f64eb6 | ||
|
|
cff6a1fec1 | ||
|
|
bcc6a09b63 | ||
|
|
9def10664e | ||
|
|
75aa1442db | ||
|
|
99397da534 | ||
|
|
8dbfcd35bf | ||
|
|
f7dac83d95 | ||
|
|
7c01f70641 | ||
|
|
51e971d39e | ||
|
|
329df38f1a | ||
|
|
580353da93 | ||
|
|
ba4994443a | ||
|
|
906a19cdb0 | ||
|
|
c4bca740e8 | ||
|
|
7f83f40dee | ||
|
|
54814fd85b | ||
|
|
7041de4384 | ||
|
|
6a62cb82cc | ||
|
|
5d2a1a9cf0 | ||
|
|
4bf35ed9ae | ||
|
|
be0b3af9e0 | ||
|
|
2cd402e169 | ||
|
|
b185230744 | ||
|
|
6a2d659d28 | ||
|
|
b2c620230a | ||
|
|
b90d8cd832 | ||
|
|
3b752a6555 | ||
|
|
ec1ad0046c | ||
|
|
57f09a419c | ||
|
|
5932634409 | ||
|
|
5cbe8d155c | ||
|
|
0d0e3a42ac | ||
|
|
74d55c065b | ||
|
|
f136da15e1 | ||
|
|
c3dde367f1 | ||
|
|
64e8d2a783 | ||
|
|
79c92c7c8a | ||
|
|
736ed38849 | ||
|
|
365791ff81 | ||
|
|
691e29ecf3 | ||
|
|
3fd02bda51 | ||
|
|
98cf2ed678 | ||
|
|
e9d32d077d | ||
|
|
2061f0b8a7 | ||
|
|
96354d6a29 | ||
|
|
d12af207d2 | ||
|
|
6eabc6cb0e | ||
|
|
2110557dab | ||
|
|
b9e84259e9 | ||
|
|
294104c3f9 | ||
|
|
38a1674abb | ||
|
|
f5c8628fdc | ||
|
|
cbc53b6b8d | ||
|
|
c54269d967 | ||
|
|
5bfd1bbc98 | ||
|
|
6984c02a27 | ||
|
|
3439c5a8e3 | ||
|
|
6806998bf9 | ||
|
|
515080ad2f | ||
|
|
3aa7b6cf66 | ||
|
|
dda4811591 | ||
|
|
82079729cc | ||
|
|
c2a8ac75e0 | ||
|
|
f178e56c68 | ||
|
|
dd793d1de5 | ||
|
|
bc34937d68 | ||
|
|
dd248f7675 | ||
|
|
d9b34baedd | ||
|
|
c18ebfdd71 | ||
|
|
67882dbb44 | ||
|
|
7b99314301 | ||
|
|
2ce5d6688b | ||
|
|
f23871e9ee | ||
|
|
e9de9dd551 | ||
|
|
ba991d5c84 | ||
|
|
1744cc99ba | ||
|
|
e72dc6cb35 | ||
|
|
c246212952 | ||
|
|
edd5fe5fa2 | ||
|
|
5d4d90536f | ||
|
|
6c916ac8a8 | ||
|
|
832ea88fcb | ||
|
|
8c00f9c15d | ||
|
|
0cbc1d2b4f | ||
|
|
ff9ddbceee | ||
|
|
9c62db07ed | ||
|
|
cf90ae0123 | ||
|
|
f5dda63eb5 | ||
|
|
7187507301 | ||
|
|
f1e72cc19a | ||
|
|
5b15bde539 | ||
|
|
bd620b01fb | ||
|
|
d9a252bc8e | ||
|
|
67005a07bc | ||
|
|
c35e4a3dd7 | ||
|
|
1f5674218f | ||
|
|
b12518d3cf | ||
|
|
6c5b7af152 | ||
|
|
8065a7e220 | ||
|
|
3f3b6b2150 | ||
|
|
a7dcc62086 | ||
|
|
ad137cd111 | ||
|
|
111af1fa2c | ||
|
|
1b2eaac316 | ||
|
|
3730a1c832 | ||
|
|
949e49a685 | ||
|
|
4a30d7e3cc | ||
|
|
e83db9e7e3 | ||
|
|
78687504f7 | ||
|
|
d571ca0108 | ||
|
|
afed90a034 | ||
|
|
3ee5c4bca5 | ||
|
|
e9c2732b97 | ||
|
|
d8714530d1 | ||
|
|
7d46c8d378 | ||
|
|
da971ec7a5 | ||
|
|
3eea74889f | ||
|
|
f758aed0e8 | ||
|
|
e5150f2c28 | ||
|
|
59a1eb59c9 | ||
|
|
6820724e51 | ||
|
|
b23ce92032 | ||
|
|
2bd231a7b7 | ||
|
|
8a173382c8 | ||
|
|
07feecde1a | ||
|
|
19091efc44 | ||
|
|
95db455e7f | ||
|
|
7879f24dcc | ||
|
|
13db4369d9 | ||
|
|
4ad7b53e59 | ||
|
|
f0cc0e68e3 | ||
|
|
db5ec52ad7 | ||
|
|
114d7270ff | ||
|
|
32c86e494a | ||
|
|
8eadcf0b90 | ||
|
|
5002175e80 | ||
|
|
daef218b55 | ||
|
|
fa9e385229 | ||
|
|
26e1188e51 | ||
|
|
a3e8a05d4c | ||
|
|
e441bad674 | ||
|
|
1b44aaf4e3 | ||
|
|
9e4e6fe207 | ||
|
|
ab66536dbf | ||
|
|
728c4c8a06 | ||
|
|
1f12122b17 | ||
|
|
890d8d960b | ||
|
|
9e74d9d003 | ||
|
|
9333fb8eb9 | ||
|
|
e2b85cf86a | ||
|
|
845a3f26f9 | ||
|
|
f07d513320 | ||
|
|
4a6769053a | ||
|
|
f31c1f90e3 | ||
|
|
3ce2c050dd | ||
|
|
1c0afa13c5 | ||
|
|
d919ecc771 | ||
|
|
e691918e3b | ||
|
|
81fbb3655f | ||
|
|
0e9164b40a | ||
|
|
1b8a0d71cf | ||
|
|
bd7efe95d0 | ||
|
|
f5bb85b435 | ||
|
|
28c145eb57 | ||
|
|
e2afb03c92 | ||
|
|
6e2527a7cb | ||
|
|
cdab68dcdb | ||
|
|
d1c3d7d139 | ||
|
|
77490c6f2f | ||
|
|
48f589e18b | ||
|
|
348616ac4b | ||
|
|
15985680e2 | ||
|
|
d74674bbd9 | ||
|
|
703475f6c2 | ||
|
|
d47af2bc02 | ||
|
|
319ad7f1d3 | ||
|
|
0f0d8bc065 | ||
|
|
55d6361b13 | ||
|
|
cd9c0d65d9 |
@@ -8,10 +8,6 @@ set -o pipefail
|
|||||||
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
|
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
|
||||||
mkdir -p images
|
mkdir -p images
|
||||||
cd images
|
cd images
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
|
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
|
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
||||||
|
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.671
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.664
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.892
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.892
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||||
|
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.755
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.755
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||||
|
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.753
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.753
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||||
|
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.728
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.728
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.756
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.752
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||||
|
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.86
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.86
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||||
|
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.624
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.624
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
|
||||||
|
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.616
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.632
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||||
|
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.593
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.588
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
||||||
|
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.595
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.582
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||||
|
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.792
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.824
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
4
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
4
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
|
Qwen2-57B-A14-Instruct.yaml
|
||||||
|
DeepSeek-V2-Lite-Chat.yaml
|
||||||
5
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
5
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
Meta-Llama-3-8B-Instruct.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-FP8.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
|
||||||
|
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
|
||||||
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||||
|
echo "This pathway is intended to be used to create baselines for "
|
||||||
|
echo "our automated nm-test-accuracy workflow"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -m - huggingface stub or local directory of the model"
|
||||||
|
echo " -b - batch size to run the evaluation at"
|
||||||
|
echo " -l - limit number of samples to run"
|
||||||
|
echo " -f - number of fewshot samples to use"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
while getopts "m:b:l:f:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
m )
|
||||||
|
MODEL="$OPTARG"
|
||||||
|
;;
|
||||||
|
b )
|
||||||
|
BATCH_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
l )
|
||||||
|
LIMIT="$OPTARG"
|
||||||
|
;;
|
||||||
|
f )
|
||||||
|
FEWSHOT="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
lm_eval --model hf \
|
||||||
|
--model_args pretrained=$MODEL,parallelize=True \
|
||||||
|
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||||
|
--batch_size $BATCH_SIZE
|
||||||
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for vllm.
|
||||||
|
# We use this for fp8, which HF does not support.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install lm-eval==0.4.3
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||||
|
echo "This pathway is intended to be used to create baselines for "
|
||||||
|
echo "our automated nm-test-accuracy workflow"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -m - huggingface stub or local directory of the model"
|
||||||
|
echo " -b - batch size to run the evaluation at"
|
||||||
|
echo " -l - limit number of samples to run"
|
||||||
|
echo " -f - number of fewshot samples to use"
|
||||||
|
echo " -t - tensor parallel size to run at"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
while getopts "m:b:l:f:t:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
m )
|
||||||
|
MODEL="$OPTARG"
|
||||||
|
;;
|
||||||
|
b )
|
||||||
|
BATCH_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
l )
|
||||||
|
LIMIT="$OPTARG"
|
||||||
|
;;
|
||||||
|
f )
|
||||||
|
FEWSHOT="$OPTARG"
|
||||||
|
;;
|
||||||
|
t )
|
||||||
|
TP_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
lm_eval --model vllm \
|
||||||
|
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
|
||||||
|
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||||
|
--batch_size $BATCH_SIZE
|
||||||
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using vllm and compares to "
|
||||||
|
echo "precomputed baseline (measured by HF transformers.)"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
|
||||||
|
echo " -t - tensor parallel size"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
SUCCESS=0
|
||||||
|
|
||||||
|
while getopts "c:t:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
c )
|
||||||
|
CONFIG="$OPTARG"
|
||||||
|
;;
|
||||||
|
t )
|
||||||
|
TP_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# Parse list of configs.
|
||||||
|
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
|
||||||
|
|
||||||
|
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||||
|
do
|
||||||
|
LOCAL_SUCCESS=0
|
||||||
|
|
||||||
|
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
|
||||||
|
|
||||||
|
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
|
||||||
|
export LM_EVAL_TP_SIZE=$TP_SIZE
|
||||||
|
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
|
||||||
|
|
||||||
|
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||||
|
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
else
|
||||||
|
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${SUCCESS}" -eq "0" ]; then
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
55
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
55
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
"""
|
||||||
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
|
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
* export LM_EVAL_TP_SIZE=4
|
||||||
|
* pytest -s test_lm_eval_correctness.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import lm_eval
|
||||||
|
import numpy
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
RTOL = 0.02
|
||||||
|
TEST_DATA_FILE = os.environ.get(
|
||||||
|
"LM_EVAL_TEST_DATA_FILE",
|
||||||
|
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
||||||
|
|
||||||
|
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
||||||
|
|
||||||
|
|
||||||
|
def launch_lm_eval(eval_config):
|
||||||
|
model_args = f"pretrained={eval_config['model_name']}," \
|
||||||
|
f"tensor_parallel_size={TP_SIZE}," \
|
||||||
|
f"add_bos_token=true"
|
||||||
|
|
||||||
|
results = lm_eval.simple_evaluate(
|
||||||
|
model="vllm",
|
||||||
|
model_args=model_args,
|
||||||
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
|
limit=eval_config["limit"],
|
||||||
|
batch_size="auto")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def test_lm_eval_correctness():
|
||||||
|
eval_config = yaml.safe_load(
|
||||||
|
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
# Launch eval requests.
|
||||||
|
results = launch_lm_eval(eval_config)
|
||||||
|
|
||||||
|
# Confirm scores match ground truth.
|
||||||
|
for task in eval_config["tasks"]:
|
||||||
|
for metric in task["metrics"]:
|
||||||
|
ground_truth = metric["value"]
|
||||||
|
measured_value = results["results"][task["name"]][metric["name"]]
|
||||||
|
print(f'{task["name"]} | {metric["name"]}: '
|
||||||
|
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||||
|
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||||
104
.buildkite/nightly-benchmarks/README.md
Normal file
104
.buildkite/nightly-benchmarks/README.md
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
# vLLM benchmark suite
|
||||||
|
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
This directory contains the performance benchmarking CI for vllm.
|
||||||
|
The goal is to help developers know the impact of their PRs on the performance of vllm.
|
||||||
|
|
||||||
|
This benchmark will be *triggered* upon:
|
||||||
|
- A PR being merged into vllm.
|
||||||
|
- Every commit for those PRs with `perf-benchmarks` label.
|
||||||
|
|
||||||
|
**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models.
|
||||||
|
|
||||||
|
**Benchmarking Duration**: about 1hr.
|
||||||
|
|
||||||
|
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
|
||||||
|
|
||||||
|
|
||||||
|
## Configuring the workload
|
||||||
|
|
||||||
|
The benchmarking workload contains three parts:
|
||||||
|
- Latency tests in `latency-tests.json`.
|
||||||
|
- Throughput tests in `throughput-tests.json`.
|
||||||
|
- Serving tests in `serving-tests.json`.
|
||||||
|
|
||||||
|
See [descriptions.md](tests/descriptions.md) for detailed descriptions.
|
||||||
|
|
||||||
|
### Latency test
|
||||||
|
|
||||||
|
Here is an example of one test inside `latency-tests.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
In this example:
|
||||||
|
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||||
|
- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Note: please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.
|
||||||
|
|
||||||
|
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||||
|
|
||||||
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
|
||||||
|
|
||||||
|
|
||||||
|
### Throughput test
|
||||||
|
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are passed to `benchmark_throughput.py`.
|
||||||
|
|
||||||
|
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||||
|
|
||||||
|
### Serving test
|
||||||
|
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||||
|
|
||||||
|
```
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Inside this example:
|
||||||
|
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||||
|
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
|
||||||
|
- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
|
||||||
|
- The `qps_list` attribute controls the list of QPS values to test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`.
|
||||||
|
|
||||||
|
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) can still vary the output greatly.
|
||||||
|
|
||||||
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||||
|
|
||||||
|
## Visualizing the results
|
||||||
|
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||||
|
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||||
|
If you do not see the table, please wait until the benchmark finishes running.
|
||||||
|
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
|
||||||
|
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
|
||||||
61
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
61
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
steps:
|
||||||
|
- label: "Wait for container to be ready"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
containers:
|
||||||
|
- image: badouralix/curl-jq
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||||
|
- wait
|
||||||
|
- label: "A100"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
containers:
|
||||||
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
- label: "H100"
|
||||||
|
agents:
|
||||||
|
queue: H100
|
||||||
|
plugins:
|
||||||
|
- docker#v5.11.0:
|
||||||
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash
|
||||||
|
- .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
mount-buildkite-agent: true
|
||||||
|
propagate-environment: true
|
||||||
|
ipc: host
|
||||||
|
gpus: all
|
||||||
|
environment:
|
||||||
|
- VLLM_USAGE_SOURCE
|
||||||
|
- HF_TOKEN
|
||||||
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Install system packages
|
|
||||||
apt update
|
|
||||||
apt install -y curl jq
|
|
||||||
|
|
||||||
# Install minijinja for templating
|
|
||||||
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
|
|
||||||
source $HOME/.cargo/env
|
|
||||||
|
|
||||||
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
|
|
||||||
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
|
|
||||||
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
|
|
||||||
|
|
||||||
if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
|
|
||||||
echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
|
|
||||||
else
|
|
||||||
echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Upload sample.yaml
|
|
||||||
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
|
|
||||||
45
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
45
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
|
||||||
|
# Nightly benchmark
|
||||||
|
|
||||||
|
The main goal of this benchmarking is two-fold:
|
||||||
|
- Performance clarity: Provide clarity on which engine (vllm, tensorrt-llm, lmdeploy or tgi) leads in performance for which workload.
|
||||||
|
- Reproducibility: one can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
|
||||||
|
|
||||||
|
|
||||||
|
## Docker images
|
||||||
|
|
||||||
|
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
|
||||||
|
- vllm/vllm-openai:v0.5.0.post1
|
||||||
|
- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||||
|
- openmmlab/lmdeploy:v0.5.0
|
||||||
|
- ghcr.io/huggingface/text-generation-inference:2.1
|
||||||
|
|
||||||
|
<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
|
||||||
|
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
One AWS node with 8x NVIDIA A100 GPUs.
|
||||||
|
|
||||||
|
|
||||||
|
## Workload description
|
||||||
|
|
||||||
|
We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
|
||||||
|
|
||||||
|
- Input length: randomly sample 500 prompts from the ShareGPT dataset (with a fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 500 prompts.
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with a fixed random seed).
|
||||||
|
- Evaluation metrics: Throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better).
|
||||||
|
|
||||||
|
<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
|
||||||
|
|
||||||
|
## Plots
|
||||||
|
|
||||||
|
In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.
|
||||||
|
|
||||||
|
<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
{nightly_results_benchmarking_table}
|
||||||
120
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
Normal file
120
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
common_pod_spec: &common_pod_spec
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
- name: hf-cache
|
||||||
|
hostPath:
|
||||||
|
path: /root/.cache/huggingface
|
||||||
|
type: Directory
|
||||||
|
|
||||||
|
common_container_settings: &common_container_settings
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
- name: hf-cache
|
||||||
|
mountPath: /root/.cache/huggingface
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_HOME
|
||||||
|
value: /root/.cache/huggingface
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||||
|
- label: "A100 trt benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
- label: "A100 lmdeploy benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: openmmlab/lmdeploy:v0.5.0
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
|
||||||
|
- label: "A100 vllm benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: vllm/vllm-openai:latest
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
- label: "A100 tgi benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: ghcr.io/huggingface/text-generation-inference:2.1
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
- wait
|
||||||
|
|
||||||
|
- label: "Plot"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: vllm/vllm-openai:v0.5.0.post1
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
|
||||||
|
- wait
|
||||||
376
.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
Normal file
376
.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
Normal file
@@ -0,0 +1,376 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script should be run inside the CI process
|
||||||
|
# This script assumes that we are already inside the vllm/ directory
|
||||||
|
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||||
|
|
||||||
|
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
|
||||||
|
# and we still want to see other benchmarking results even when mixtral crashes.
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Publish two globals used by the rest of the script:
  #   gpu_count - number of lines printed by `nvidia-smi --list-gpus`
  #   gpu_type  - second whitespace-separated word of the reported GPU names
  # Exits with status 1 when no GPU is listed.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # The unquoted expansion folds the per-GPU name lines onto a single line
  # before awk picks the second field.
  local gpu_names
  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type=$(echo $gpu_names | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
check_hf_token() {
  # Validate the HF_TOKEN environment variable: it must be non-empty and
  # start with "hf_". Exits with status 1 otherwise.
  case "$HF_TOKEN" in
    "")
      echo "Error: HF_TOKEN is not set."
      exit 1
      ;;
    hf_*)
      echo "HF_TOKEN is set and valid."
      ;;
    *)
      echo "Error: HF_TOKEN does not start with 'hf_'."
      exit 1
      ;;
  esac
}
|
||||||
|
|
||||||
|
json2args() {
  # Render a flat JSON object as command line flags: every key becomes
  # "--key" with '_' replaced by '-', followed by its stringified value.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local rendered
  rendered=$(
    jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    ' <<< "$json_string"
  )
  echo "$rendered"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Poll the completions endpoint on localhost:8000 once per second until
  # curl succeeds. Returns 0 once it does, or 1 if that has not happened
  # within the 1200 second timeout.
  if timeout 1200 bash -c '
    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Forcefully terminate every compute process reported by nvidia-smi, then
  # block until nvidia-smi no longer lists any.
  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
  if [ -n "$pids" ]; then
    for pid in $pids; do
      kill -9 "$pid"
      echo "Killed process with PID: $pid"
    done

    echo "All GPU processes have been killed."
  else
    echo "No GPU processes found."
  fi

  # Killing by PID is not always enough, so also kill everything whose
  # command line mentions python / python3 (acceptable inside a container).
  pkill -9 -f python
  pkill -9 -f python3

  # Spin until the compute-apps query comes back empty.
  until [ -z "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
    sleep 1
    echo "Waiting for GPU processes to be killed"
  done

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Report the memory still in use on GPU 0 so the log shows whether the
  # cleanup worked; it should read 0 MB.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Publish the benchmarking results to buildkite: annotate the build with
  # benchmark_results.md and upload everything in $RESULTS_FOLDER.

  # Without an agent binary (neither on PATH nor at /workspace/buildkite-agent)
  # there is nothing to upload with; report it and return successfully.
  if ! command -v buildkite-agent >/dev/null 2>&1 && [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # Prefer the agent on PATH, fall back to the one in /workspace.
  if command -v buildkite-agent >/dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
  else
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
  fi

  # Use the selected agent to annotate the build and upload the artifacts.
  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
|
||||||
|
|
||||||
|
run_latency_tests() {
  # Run the latency tests using `benchmark_latency.py`.
  # $1: a json file specifying latency test cases
  #
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and (optionally)
  # TEST_SELECTOR. For every executed case the benchmark is asked to write
  # $RESULTS_FOLDER/<test_name>.json, and the command that produced it is
  # stored in $RESULTS_FOLDER/<test_name>.commands.

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests
  # NOTE(review): the loop is the right-hand side of a pipeline, so bash runs
  # it in a subshell; the `exit 1` below ends the loop, not the whole script.
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name and enforce the naming convention
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      # was "$testname", which is never set and printed an empty name
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
      '{
        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"

    kill_gpu_processes

  done
}
|
||||||
|
|
||||||
|
|
||||||
|
run_throughput_tests() {
  # Run the throughput tests using `benchmark_throughput.py`.
  # $1: a json file specifying throughput test cases
  #
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and (optionally)
  # TEST_SELECTOR. For every executed case the benchmark is asked to write
  # $RESULTS_FOLDER/<test_name>.json, and the command that produced it is
  # stored in $RESULTS_FOLDER/<test_name>.commands.

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests
  # NOTE(review): the loop is the right-hand side of a pipeline, so bash runs
  # it in a subshell; the `exit 1` below ends the loop, not the whole script.
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name and enforce the naming convention
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")

    # check if there is enough GPU to run the test
    # (quoted so the JSON reaches jq unmodified, as in the sibling functions)
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      # was "$testname", which is never set and printed an empty name
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"

    kill_gpu_processes

  done
}
|
||||||
|
|
||||||
|
run_serving_tests() {
  # Run the serving tests using `benchmark_serving.py`.
  # $1: a json file specifying serving test cases
  #
  # For each test case a vllm OpenAI-compatible server is started in the
  # background, the client benchmark is run once per entry of the case's
  # qps_list, and the server is killed again. Reads the globals gpu_count,
  # gpu_type, RESULTS_FOLDER and (optionally) TEST_SELECTOR.

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  # NOTE(review): the loop is the right-hand side of a pipeline, so bash runs
  # it in a subshell; the `exit 1` below ends the loop, not the whole script.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name and enforce the naming convention
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    # @sh shell-quotes each entry, so string entries such as "inf" come out
    # wrapped in single quotes
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      # was "$testname", which is never set and printed an empty name
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # check if server model and client model is aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      # was "$testname", which is never set and printed an empty name
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &
    server_pid=$!

    # wait until the server is alive
    # (a start-up failure is only reported; the clients below still run)
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill -9 "$server_pid"
    kill_gpu_processes
  done
}
|
||||||
|
|
||||||
|
main() {
  # Entry point: validate the environment, install missing tools, run the
  # serving, latency and throughput suites, then post-process and upload
  # the results.
  check_gpus
  check_hf_token

  # dependencies: install wget/curl and jq only when they are missing
  if ! { which wget && which curl; }; then
    apt-get update && apt-get install -y wget curl
  fi
  if ! which jq; then
    apt-get update && apt-get -y install jq
  fi

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking: serving first, then latency, then throughput
  local suite
  for suite in serving latency throughput; do
    "run_${suite}_tests" $QUICK_BENCHMARK_ROOT/tests/${suite}-tests.json
  done

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

  upload_to_buildkite
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
76
.buildkite/nightly-benchmarks/run-nightly-suite.sh
Normal file
76
.buildkite/nightly-benchmarks/run-nightly-suite.sh
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
set -x
|
||||||
|
|
||||||
|
check_gpus() {
  # Record the GPU count and GPU type in the globals gpu_count / gpu_type.
  # The count is the number of lines from `nvidia-smi --list-gpus`; the type
  # is the second whitespace-separated word of the reported GPU names.
  # Exits with status 1 when no GPU is listed.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ ! $gpu_count -gt 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
check_hf_token() {
  # Ensure HF_TOKEN is present and starts with "hf_"; exit with status 1
  # otherwise.
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  fi

  if [[ "$HF_TOKEN" != hf_* ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  fi

  echo "HF_TOKEN is set and valid."
}
|
||||||
|
|
||||||
|
main() {
  # Entry point of the nightly suite: validate the environment, fetch the
  # ShareGPT dataset, then hand off to the runner script that matches the
  # serving engine detected in the current container. The first engine
  # detected wins; if none is detected the function simply returns.

  check_gpus
  check_hf_token

  df -h

  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)

  # Abort when the benchmarks directory cannot be entered: the download and
  # the relative ../.buildkite paths below would otherwise act on the wrong
  # directory.
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || exit 1
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  # run lmdeploy
  if which lmdeploy >/dev/null; then
    echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
    bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
    exit 0
  fi

  # run tgi
  if [ -e /tgi-entrypoint.sh ]; then
    echo "tgi is available, redirect to run-tgi-nightly.sh"
    bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
    exit 0
  fi

  # run trt
  if which trtllm-build >/dev/null; then
    echo "trtllm is available, redirect to run-trt-nightly.sh"
    bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
    exit 0
  fi

  # run vllm
  if [ -e /vllm-workspace ]; then
    echo "vllm is available, redirect to run-vllm-nightly.sh"
    bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
    exit 0
  fi

}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
@@ -1,39 +0,0 @@
|
|||||||
steps:
|
|
||||||
# NOTE(simon): You can create separate blocks for different jobs
|
|
||||||
- label: "A100: NVIDIA SMI"
|
|
||||||
agents:
|
|
||||||
queue: A100
|
|
||||||
plugins:
|
|
||||||
- kubernetes:
|
|
||||||
podSpec:
|
|
||||||
containers:
|
|
||||||
# - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
|
|
||||||
# TODO(simon): check latest main branch or use the PR image.
|
|
||||||
- image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
|
|
||||||
command:
|
|
||||||
- bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
nvidia.com/gpu: 8
|
|
||||||
volumeMounts:
|
|
||||||
- name: devshm
|
|
||||||
mountPath: /dev/shm
|
|
||||||
nodeSelector:
|
|
||||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
|
||||||
volumes:
|
|
||||||
- name: devshm
|
|
||||||
emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
# TODO(simon): bring H100 online
|
|
||||||
# - label: "H100: NVIDIA SMI"
|
|
||||||
# agents:
|
|
||||||
# queue: H100
|
|
||||||
# plugins:
|
|
||||||
# - docker#v5.11.0:
|
|
||||||
# image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
|
|
||||||
# command:
|
|
||||||
# - bash -c 'nvidia-smi && nvidia-smi topo -m'
|
|
||||||
# propagate-environment: true
|
|
||||||
# ipc: host
|
|
||||||
# gpus: all
|
|
||||||
|
|
||||||
@@ -0,0 +1,192 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
# Directory (relative to the working directory) holding the benchmark output.
results_folder = Path("results/")

# Latency results, plus the raw-key -> markdown-header mapping that selects
# and renames the columns shown in the latency table.
latency_results = []
latency_column_mapping = dict(
    test_name="Test name",
    gpu_type="GPU",
    avg_latency="Mean latency (ms)",
    # P10="P10 (s)",
    # P25="P25 (s)",
    P50="Median latency (ms)",
    # P75="P75 (s)",
    # P90="P90 (s)",
    P99="P99 latency (ms)",
)

# Throughput results, plus the columns shown in the throughput table.
throughput_results = []
throughput_results_column_mapping = dict(
    test_name="Test name",
    gpu_type="GPU",
    # num_requests="# of req.",
    # total_num_tokens="Total # of tokens",
    # elapsed_time="Elapsed time (s)",
    requests_per_second="Tput (req/s)",
    # tokens_per_second="Tput (tok/s)",
)

# Serving results, plus the columns shown in the serving table.
serving_results = []
serving_column_mapping = dict(
    test_name="Test name",
    gpu_type="GPU",
    # completed="# of req.",
    request_throughput="Tput (req/s)",
    # input_throughput="Input Tput (tok/s)",
    # output_throughput="Output Tput (tok/s)",
    mean_ttft_ms="Mean TTFT (ms)",
    median_ttft_ms="Median TTFT (ms)",
    p99_ttft_ms="P99 TTFT (ms)",
    # mean_tpot_ms="Mean TPOT (ms)",
    # median_tpot_ms="Median",
    # p99_tpot_ms="P99",
    mean_itl_ms="Mean ITL (ms)",
    median_itl_ms="Median ITL (ms)",
    p99_itl_ms="P99 ITL (ms)",
)
|
||||||
|
|
||||||
|
|
||||||
|
def read_markdown(file):
    """Return the text of ``file`` followed by a newline.

    When the path does not exist, a "<file> not found." line is returned
    instead of raising.
    """
    if not os.path.exists(file):
        return f"{file} not found.\n"
    with open(file, "r") as handle:
        contents = handle.read()
    return contents + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def results_to_json(latency, throughput, serving):
    """Serialize the three result frames into a single JSON string.

    Each argument is converted with its ``to_dict()`` method and stored
    under the keys 'latency', 'throughput' and 'serving', in that order.
    """
    labelled_frames = (
        ('latency', latency),
        ('throughput', throughput),
        ('serving', serving),
    )
    return json.dumps(
        {label: frame.to_dict()
         for label, frame in labelled_frames})
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Substring looked for in the result path -> list that collects results
    # of that kind. Checked in this order (serving, latency, throughput).
    result_buckets = (
        ("serving", serving_results),
        ("latency", latency_results),
        ("throughput", throughput_results),
    )

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        # Classify the file by its path; every kind is detected the same
        # way instead of mixing the path with the closed handle's name.
        file_label = str(test_file)
        for result_kind, bucket in result_buckets:
            if result_kind in file_label:
                break
        else:
            print(f"Skipping {test_file}")
            continue

        # attach the benchmarking command (stored next to the result in a
        # ".commands" file) to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        if result_kind == "latency":
            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

        # add the result to the list for its kind
        bucket.append(raw_result)

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    # JSON of the unfiltered frames; computed but not written anywhere in
    # this script.
    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the key, for visualization purpose: keep only the mapped
    # columns and rename them to their markdown headers
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result: fill the description template with the tables
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(
            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json, as one flat list of records
    with open(results_folder / "benchmark_results.json", "w") as f:

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
|
||||||
26
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
Normal file
26
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
import argparse
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def main(model, cachedir):
    """Download the tokenizer for ``model`` and store it under ``cachedir``.

    Args:
        model: Identifier passed to ``AutoTokenizer.from_pretrained``.
        cachedir: Directory passed to ``save_pretrained``.
    """
    AutoTokenizer.from_pretrained(model).save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Both options are mandatory strings and are forwarded to main() as-is.
    cli = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer")
    for flag, help_text in (
        ("--model", "Name of the model"),
        ("--cachedir", "Directory to save the tokenizer"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)

    args = cli.parse_args()
    main(args.model, args.cachedir)
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
from lmdeploy.serve.openai.api_client import APIClient
|
||||||
|
|
||||||
|
# Ask the lmdeploy API server listening on localhost:8000 for its model list
# and print the first entry of `available_models` to stdout.
api_client = APIClient("http://localhost:8000")
print(api_client.available_models[0])
|
||||||
102
.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
Normal file
102
.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#!/bin/bash

# Convert a checkpoint to a TensorRT-LLM engine and launch a Triton server on
# it in the background.
#
# Arguments:
#   $1: JSON object of server parameters; the keys read are model_type,
#       model_dtype, max_batch_size, max_input_len, max_output_len and
#       trt_llm_version.
#   $2: JSON object of common parameters; the keys read are model and tp, and
#       the mere presence of an "fp8" key selects quantize.py.
#
# Every `cd` is checked: the steps that follow it delete and clone
# directories relative to the current directory, so continuing after a failed
# `cd` would operate on the wrong location.

server_params=$1
common_params=$2

model_path=$(echo "$common_params" | jq -r '.model')
# Drop everything up to and including the first "/" of the model path.
model_name="${model_path#*/}"
model_type=$(echo "$server_params" | jq -r '.model_type')
model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
model_tp_size=$(echo "$common_params" | jq -r '.tp')
max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

# Start from an empty ~/models directory for the checkpoint and the engine.
cd ~ || exit 1
rm -rf models
mkdir -p models
cd models || exit 1
models_dir=$(pwd)
trt_model_path=${models_dir}/${model_name}-trt-ckpt
trt_engine_path=${models_dir}/${model_name}-trt-engine

# Fresh clone of the repository that provides triton_model_repo.
cd ~ || exit 1
rm -rf tensorrt-demo
git clone https://github.com/neuralmagic/tensorrt-demo.git
cd tensorrt-demo || exit 1
tensorrt_demo_dir=$(pwd)

# make sure the parameter inside tensorrt_demo is consistent to envvar
for component in postprocessing preprocessing; do
  sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" "./triton_model_repo/${component}/config.pbtxt"
done
for component in ensemble preprocessing postprocessing tensorrt_llm_bls; do
  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" "./triton_model_repo/${component}/config.pbtxt"
done

# Fresh clone of the Triton TensorRT-LLM backend at the requested version.
cd / || exit 1
rm -rf tensorrtllm_backend
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend || exit 1
git checkout "$trt_llm_version"
tensorrtllm_backend_dir=$(pwd)
git submodule update --init --recursive
cp -r "${tensorrt_demo_dir}/triton_model_repo" "${tensorrtllm_backend_dir}/"

cd /tensorrtllm_backend || exit 1
cd "./tensorrt_llm/examples/${model_type}" || exit 1

if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then

  echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
  echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
  python ../quantization/quantize.py \
    --model_dir "${model_path}" \
    --dtype "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --output_dir "${trt_model_path}" \
    --qformat fp8 \
    --kv_cache_dtype fp8 \
    --calib_size 2

else

  echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
  python3 convert_checkpoint.py \
    --model_dir "${model_path}" \
    --dtype "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --output_dir "${trt_model_path}"

fi

# Build the engine from the converted checkpoint.
trtllm-build \
  --checkpoint_dir="${trt_model_path}" \
  --gpt_attention_plugin="${model_dtype}" \
  --gemm_plugin="${model_dtype}" \
  --remove_input_padding=enable \
  --paged_kv_cache=enable \
  --tp_size="${model_tp_size}" \
  --max_batch_size="${max_batch_size}" \
  --max_input_len="${max_input_len}" \
  --max_output_len="${max_output_len}" \
  --max_num_tokens="${max_output_len}" \
  --opt_num_tokens="${max_output_len}" \
  --output_dir="${trt_engine_path}"

# Replace the engine files in the model repository, then start Triton in the
# background.
cd /tensorrtllm_backend/triton_model_repo || exit 1
rm -rf ./tensorrt_llm/1/*
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
cd /tensorrtllm_backend || exit 1
python3 scripts/launch_triton_server.py \
  --world_size="${model_tp_size}" \
  --model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||||
40
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Normal file
40
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
|
||||||
|
main() {
  # Download the nightly result files, render the summary figure and
  # markdown, and attach both to the Buildkite build.
  # Reads VLLM_SOURCE_CODE_LOC for the location of the source tree; every
  # path built from it is quoted so a location containing spaces still works.

  # Install the helper tools only when they are missing.
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)

  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip plotting the results."
    exit 0
  fi

  # initial annotation
  description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

  # download results
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  mkdir -p results/
  /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
  # List what was downloaded so it shows up in the build log.
  ls
  ls results/

  # generate figures
  python3 -m pip install tabulate pandas matplotlib
  python3 "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py" \
    --description "$description" \
    --results-folder results/

  # upload results and figures
  /workspace/buildkite-agent artifact upload "nightly_results.png"
  /workspace/buildkite-agent artifact upload "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml"
  /workspace/buildkite-agent artifact upload "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json"
  /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
|
||||||
135
.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
Normal file
135
.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Parse the command line of the nightly-results plotting script.

    Returns:
        argparse.Namespace with ``results_folder`` and ``description``;
        both options are required strings.
    """
    cli = argparse.ArgumentParser(
        description=
        'Parse command line arguments for summary-nightly-results script.')
    required_options = (
        ('--results-folder', 'The folder where the results are stored.'),
        ('--description', 'Description of the results.'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Write the nightly results as a markdown report and a bar-chart figure.

    Loads every ``*_nightly_results.json`` file in ``args.results_folder``,
    fills the template read from ``args.description`` with a markdown table
    of the loaded rows and writes it to ``nightly_results.md``. Then draws a
    3x3 grid of bar charts (one row per model; columns are throughput, TTFT
    and ITL, one bar per engine) and saves it as ``nightly_results.png``.

    Args:
        args: Namespace providing ``results_folder`` and ``description``.
    """
    palette = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
    engines = ["vllm", "trt", "lmdeploy", "tgi"]

    # collect results
    records = []
    for result_path in Path(args.results_folder).glob(
            "*_nightly_results.json"):
        with open(result_path, "r") as fh:
            records = records + json.loads(fh.read())

    # generate markdown table
    table = pd.DataFrame.from_dict(records)
    md_table = tabulate(table,
                        headers='keys',
                        tablefmt='pipe',
                        showindex=False)

    with open(args.description, "r") as fh:
        template = fh.read()
    # Format before opening the output so a template error leaves any
    # existing report untouched.
    report = template.format(nightly_results_benchmarking_table=md_table)
    with open("nightly_results.md", "w") as fh:
        fh.write(report)

    plt.rcParams.update({'font.size': 20})

    # plot results
    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
    fig.subplots_adjust(hspace=1)

    def rows_for(model, engine):
        # Rows whose test name contains the model and whose engine column
        # contains the engine name.
        mask = table['Test name'].str.contains(model)
        mask = mask & table['Engine'].str.contains(engine)
        return table[mask]

    def colour_bars(bars):
        # One fixed colour per engine, in the order of `engines`.
        for idx, bar in enumerate(bars):
            bar.set_color(palette[idx])

    for row, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
        # Latency metrics occupy columns 1 and 2 of this model's row.
        for col, metric in enumerate(["TTFT", "ITL"], start=1):
            means, stds = [], []
            for engine in engines:
                hits = rows_for(model, engine)
                if hits.empty:
                    means.append(0.)
                    stds.append(0.)
                    continue
                means.append(hits[f"Mean {metric} (ms)"].values[0])
                std = hits[f"Std {metric} (ms)"].values[0]
                success = hits["Successful req."].values[0]
                # Error bar is std divided by sqrt(successful requests).
                stds.append(std / math.sqrt(success))

            print(model, metric)
            print(means, stds)

            ax = axes[row, col]
            colour_bars(ax.bar(engines, means, yerr=stds, capsize=10))
            ax.set_ylim(bottom=0)
            ax.set_ylabel(f"{metric} (ms)")
            ax.set_title(f"{model} {metric}")
            ax.grid(axis='y')

        # Throughput (input + output tokens per second) goes in column 0.
        metric = "Tput"
        tputs = []
        for engine in engines:
            hits = rows_for(model, engine)
            if hits.empty:
                tputs.append(0.)
                continue
            input_tput = hits["Input Tput (tok/s)"].values[0]
            output_tput = hits["Output Tput (tok/s)"].values[0]
            tputs.append(input_tput + output_tput)

        print(model, metric)
        print(tputs)

        ax = axes[row, 0]
        colour_bars(ax.bar(engines, tputs))
        ax.set_ylim(bottom=0)
        ax.set_ylabel("Tput (token/s)")
        ax.set_title(f"{model} {metric}")
        ax.grid(axis='y')

    fig.tight_layout()
    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Parse the command line and hand the namespace straight to main().
    main(parse_arguments())
|
||||||
218
.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
Normal file
218
.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Record the GPU count and GPU type in the globals gpu_count and gpu_type.
  # Exits the script with status 1 when nvidia-smi lists no GPU.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."
  # The unquoted $(...) joins nvidia-smi's output onto one line; the second
  # word of that line is kept as the GPU type.
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Stop lmdeploy processes; finding none to kill is not treated as an error.
  pkill lmdeploy || true

  # Give the killed processes time to exit completely.
  sleep 10

  # Log the memory still in use on GPU 0 so the output shows whether all GPU
  # processes are gone (it should read 0 MB).
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
json2args() {
  # Turn a flat JSON object into a command line argument string, replacing
  # every '_' in a key with '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local to_flags='to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
  local rendered
  rendered=$(jq -r "$to_flags" <<< "$json_string")
  printf '%s\n' "$rendered"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Wait for the lmdeploy server to accept requests.
  # Polls /v1/completions once per second until curl succeeds, giving up
  # after 1200 seconds.
  # $1: port to poll (optional, defaults to 8000)
  # Returns 0 once curl succeeds, 1 if the timeout expires first.
  local port="${1:-8000}"

  # The port is handed to the inner shell as its $1 (the word before it fills
  # $0), which lets the polling script stay single-quoted.
  if timeout 1200 bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' wait_for_server "$port"; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `benchmark_serving.py`
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over serving tests
|
||||||
|
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the GPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# append lmdeploy to the test name
|
||||||
|
test_name=lmdeploy_$test_name
|
||||||
|
|
||||||
|
# get common parameters
|
||||||
|
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||||
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
|
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||||
|
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||||
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
|
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# get client and server arguments
|
||||||
|
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
|
||||||
|
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
|
||||||
|
server_args=$(json2args "$server_params")
|
||||||
|
client_args=$(json2args "$client_params")
|
||||||
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# check if there is enough GPU to run the test
|
||||||
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
|
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# prepare tokenizer
|
||||||
|
rm -rf /tokenizer_cache
|
||||||
|
mkdir /tokenizer_cache
|
||||||
|
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||||
|
--model "$model" \
|
||||||
|
--cachedir /tokenizer_cache
|
||||||
|
|
||||||
|
server_command="lmdeploy serve api_server $model \
|
||||||
|
--tp $tp \
|
||||||
|
--server-port $port \
|
||||||
|
$server_args"
|
||||||
|
|
||||||
|
# run the server
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Server command: $server_command"
|
||||||
|
bash -c "$server_command" &
|
||||||
|
|
||||||
|
# wait until the server is alive
|
||||||
|
wait_for_server
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo ""
|
||||||
|
echo "lmdeploy server is up and running."
|
||||||
|
else
|
||||||
|
echo ""
|
||||||
|
echo "lmdeploy failed to start within the timeout period."
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
# get model name
|
||||||
|
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||||
|
|
||||||
|
# iterate over different QPS
|
||||||
|
for qps in $qps_list; do
|
||||||
|
# remove the surrounding single quote from qps
|
||||||
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
|
echo "qps was $qps"
|
||||||
|
qps="inf"
|
||||||
|
echo "now qps is $qps"
|
||||||
|
fi
|
||||||
|
|
||||||
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--backend lmdeploy \
|
||||||
|
--tokenizer /tokenizer_cache \
|
||||||
|
--dataset-name $dataset_name \
|
||||||
|
--dataset-path $dataset_path \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--port $port \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--model \"$model_name\" \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
|
eval "$client_command"
|
||||||
|
|
||||||
|
# record the benchmarking commands
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg engine "lmdeploy" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
engine: $engine
|
||||||
|
}')
|
||||||
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
# clean up
|
||||||
|
kill_gpu_processes
|
||||||
|
rm -rf /root/.cache/huggingface/*
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload everything under $RESULTS_FOLDER as Buildkite artifacts.
  # Without the agent binary there is nothing to upload with, so the step is
  # skipped and the function still succeeds.
  if [ -f /workspace/buildkite-agent ]; then
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
  fi
}
|
||||||
|
|
||||||
|
|
||||||
|
main() {
  # Run the lmdeploy nightly serving benchmarks, summarise the results and
  # upload them.
  # Reads VLLM_SOURCE_CODE_LOC for the location of the source tree.

  check_gpus

  # enter vllm directory; every later path is relative to it, so stop here
  # if it cannot be entered
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  python -m pip install transformers==4.41.2

  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}

main "$@"
|
||||||
216
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
Normal file
216
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Record the GPU count and GPU type in the globals gpu_count and gpu_type.
  # Exits the script with status 1 when nvidia-smi lists no GPU.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."
  # The unquoted $(...) joins nvidia-smi's output onto one line; the second
  # word of that line is kept as the GPU type.
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Stop text-generation processes; finding none to kill is not treated as
  # an error.
  pkill text-generation || true

  # Give the killed processes time to exit completely.
  sleep 10

  # Log the memory still in use on GPU 0 so the output shows whether all GPU
  # processes are gone (it should read 0 MB).
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
json2args() {
  # Turn a flat JSON object into a command line argument string, replacing
  # every '_' in a key with '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local to_flags='to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
  local rendered
  rendered=$(jq -r "$to_flags" <<< "$json_string")
  printf '%s\n' "$rendered"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Wait for the tgi server to accept requests.
  # Polls /generate_stream once per second until curl succeeds, giving up
  # after 1200 seconds.
  # $1: port to poll (optional, defaults to 8000)
  # Returns 0 once curl succeeds, 1 if the timeout expires first.
  local port="${1:-8000}"

  # The port is handed to the inner shell as its $1 (the word before it fills
  # $0), which lets the polling script stay single-quoted.
  if timeout 1200 bash -c '
    until curl -s "localhost:$1/generate_stream" > /dev/null; do
      sleep 1
    done' wait_for_server "$port"; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `benchmark_serving.py`
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over serving tests
|
||||||
|
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the GPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# append tgi to the test name
|
||||||
|
test_name=tgi_$test_name
|
||||||
|
|
||||||
|
# get common parameters
|
||||||
|
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||||
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
|
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||||
|
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||||
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
|
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||||
|
|
||||||
|
# get client and server arguments
|
||||||
|
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
|
||||||
|
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
|
||||||
|
server_args=$(json2args "$server_params")
|
||||||
|
client_args=$(json2args "$client_params")
|
||||||
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# check if there is enough GPU to run the test
|
||||||
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
|
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
|
||||||
|
echo "Key 'fp8' exists in common params."
|
||||||
|
server_command="/tgi-entrypoint.sh \
|
||||||
|
--model-id $model \
|
||||||
|
--num-shard $tp \
|
||||||
|
--port $port \
|
||||||
|
--quantize fp8 \
|
||||||
|
$server_args"
|
||||||
|
else
|
||||||
|
echo "Key 'fp8' does not exist in common params."
|
||||||
|
server_command="/tgi-entrypoint.sh \
|
||||||
|
--model-id $model \
|
||||||
|
--num-shard $tp \
|
||||||
|
--port $port \
|
||||||
|
$server_args"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# run the server
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Server command: $server_command"
|
||||||
|
eval "$server_command" &
|
||||||
|
|
||||||
|
# wait until the server is alive
|
||||||
|
wait_for_server
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
echo ""
|
||||||
|
echo "tgi server is up and running."
|
||||||
|
else
|
||||||
|
echo ""
|
||||||
|
echo "tgi failed to start within the timeout period."
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
# iterate over different QPS
|
||||||
|
for qps in $qps_list; do
|
||||||
|
# remove the surrounding single quote from qps
|
||||||
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
|
echo "qps was $qps"
|
||||||
|
qps="inf"
|
||||||
|
echo "now qps is $qps"
|
||||||
|
fi
|
||||||
|
|
||||||
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
|
client_command="python3 benchmark_serving.py \
|
||||||
|
--backend tgi \
|
||||||
|
--model $model \
|
||||||
|
--dataset-name $dataset_name \
|
||||||
|
--dataset-path $dataset_path \
|
||||||
|
--num-prompts $num_prompts \
|
||||||
|
--port $port \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
|
eval "$client_command"
|
||||||
|
|
||||||
|
# record the benchmarking commands
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
--arg engine "tgi" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
engine: $engine
|
||||||
|
}')
|
||||||
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
# clean up
|
||||||
|
kill_gpu_processes
|
||||||
|
rm -rf /root/.cache/huggingface/*
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload everything under $RESULTS_FOLDER as Buildkite artifacts.
  # Without the agent binary there is nothing to upload with, so the step is
  # skipped and the function still succeeds.
  if [ -f /workspace/buildkite-agent ]; then
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
  fi
}
|
||||||
|
|
||||||
|
main() {
  # Run the tgi nightly serving benchmarks, summarise the results and upload
  # them.
  # Reads VLLM_SOURCE_CODE_LOC for the location of the source tree.

  check_gpus

  # enter vllm directory; every later path is relative to it, so stop here
  # if it cannot be entered
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=tgi
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}

main "$@"
|
||||||
214
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
Normal file
214
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Count the visible GPUs and record their type.
  # Sets the globals gpu_count and gpu_type; exits the script with status 1
  # when nvidia-smi lists no GPU.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)

  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # The names of all GPUs are joined onto one line (unquoted expansion) and
  # the second whitespace-separated word is kept as the GPU type.
  local gpu_names
  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type
  gpu_type=$(echo $gpu_names | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Stop the Triton server and report how much memory is still in use on GPU 0.
  # A missing tritonserver process is not treated as an error.
  if ! pkill tritonserver; then
    true
  fi

  # give the killed processes time to release the GPU
  sleep 20

  # Report the memory still used on GPU 0 so the log shows whether
  # everything was really killed (expected value: 0 MB).
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
json2args() {
  # Convert a flat JSON object into a single line of command-line flags.
  # Every key becomes "--key" with '_' replaced by '-', followed by a space
  # and the value rendered as a string; the pairs are joined with spaces.
  #   $1: the JSON object, as a string
  # example:
  #   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  #   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  # An empty-string value therefore yields a bare flag followed by a space.
  local raw_json=$1
  local flags
  flags=$(
    jq -r 'to_entries
      | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
      | join(" ")' <<< "$raw_json"
  )
  echo "$flags"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Poll localhost:8000/generate_stream once per second until curl succeeds.
  # Returns 0 as soon as one curl call succeeds, 1 when 1200 seconds pass
  # without a successful call.
  if timeout 1200 bash -c '
    until curl -s localhost:8000/generate_stream > /dev/null; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
run_serving_tests() {
  # Run the serving benchmarks against TensorRT-LLM using `benchmark_serving.py`.
  #   $1: a json file specifying serving test cases
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER, VLLM_SOURCE_CODE_LOC
  # and, optionally, TEST_SELECTOR (a regex restricting which tests run).
  # For every test case and every QPS value it writes
  #   $RESULTS_FOLDER/<name>.json      (written by benchmark_serving.py)
  #   $RESULTS_FOLDER/<name>.commands  (the commands used, as JSON)
  # The loop body runs in a pipeline subshell, so the variables assigned
  # below do not leak into the caller.

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prefix the test name with the engine name
    test_name=trt_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    # @sh leaves numbers bare and wraps strings (e.g. "inf") in single quotes
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

    echo "Running test case $test_name"
    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "trt server is up and running."
    else
      echo ""
      echo "trt failed to start within the timeout period."
      # do not leave a half-started server holding the GPUs behind
      kill_gpu_processes
      break
    fi

    # prepare tokenizer
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir -p /tokenizer_cache
    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend tensorrt-llm \
        --tokenizer /tokenizer_cache \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # the server is started by launch-trt-server.sh, so there is no
      # server command line to record here
      server_command=""

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "trt" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload every file under $RESULTS_FOLDER to Buildkite as build artifacts.
  # Returns 0 without uploading when the agent binary is absent; otherwise
  # returns the exit status of the upload command.
  local agent_bin=/workspace/buildkite-agent

  if [[ -f "$agent_bin" ]]; then
    # (annotation of the markdown summary is currently disabled)
    # "$agent_bin" annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md

    # The pattern is quoted, so the shell passes it to the agent unexpanded.
    "$agent_bin" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
}
|
||||||
|
|
||||||
|
|
||||||
|
main() {
  # Entry point of the TensorRT-LLM nightly benchmark:
  #   1. verify that GPUs are present (check_gpus exits the script otherwise),
  #   2. run every serving test listed in tests/nightly-tests.json,
  #   3. summarize the results and upload them to Buildkite.
  # Reads:  VLLM_SOURCE_CODE_LOC (root of the vllm checkout).
  # Sets:   RESULTS_FOLDER, BENCHMARK_ROOT, CURRENT_LLM_SERVING_ENGINE.

  check_gpus

  # enter vllm directory; every path below is relative to it, so abort
  # instead of benchmarking from the wrong working directory.
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || {
    echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks." >&2
    exit 1
  }

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # update transformers package, to make sure mixtral tokenizer is available
  python -m pip install transformers -U

  # exported so that the summary script can read the engine name
  export CURRENT_LLM_SERVING_ENGINE=trt
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # dependencies of the summary script
  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
221
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
Normal file
221
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Count the visible GPUs and record their type.
  # Sets the globals gpu_count and gpu_type; exits the script with status 1
  # when nvidia-smi lists no GPU.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)

  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # The names of all GPUs are joined onto one line (unquoted expansion) and
  # the second whitespace-separated word is kept as the GPU type.
  local gpu_names
  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type
  gpu_type=$(echo $gpu_names | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Stop the processes named pt_main_thread, drop the vllm config directory
  # and report how much memory is still in use on GPU 0.
  pkill pt_main_thread

  # give the killed processes time to exit
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Report the memory still used on GPU 0 so the log shows whether
  # everything was really killed (expected value: 0 MB).
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
json2args() {
  # Convert a flat JSON object into a single line of command-line flags.
  # Every key becomes "--key" with '_' replaced by '-', followed by a space
  # and the value rendered as a string; the pairs are joined with spaces.
  #   $1: the JSON object, as a string
  # example:
  #   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  #   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  # An empty-string value therefore yields a bare flag followed by a space.
  local raw_json=$1
  local flags
  flags=$(
    jq -r 'to_entries
      | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring))
      | join(" ")' <<< "$raw_json"
  )
  echo "$flags"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Wait for the vllm server to start by polling localhost:8000/v1/completions
  # once per second.
  # Returns 0 as soon as one curl call succeeds, 1 when 1200 seconds pass
  # without a successful call.
  if timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
run_serving_tests() {
  # Run the serving benchmarks against vllm using `benchmark_serving.py`.
  #   $1: a json file specifying serving test cases
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and, optionally,
  # TEST_SELECTOR (a regex restricting which tests run).
  # For every test case and every QPS value it writes
  #   $RESULTS_FOLDER/<name>.json      (written by benchmark_serving.py)
  #   $RESULTS_FOLDER/<name>.commands  (the commands used, as JSON)
  # The loop body runs in a pipeline subshell, so the variables assigned
  # below do not leak into the caller.

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prefix the test name with the engine name
    test_name=vllm_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    # @sh leaves numbers bare and wraps strings (e.g. "inf") in single quotes
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # The presence of the "fp8" key only changes which model is served;
    # the server command line itself is the same in both cases.
    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    else
      echo "Key 'fp8' does not exist in common params."
    fi
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      -tp $tp \
      --model $model \
      --port $port \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
      # the server was started in the background above; do not leave it
      # behind holding the GPUs
      kill_gpu_processes
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend vllm \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "vllm" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
|
||||||
|
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload every file under $RESULTS_FOLDER to Buildkite as build artifacts.
  # Returns 0 without uploading when the agent binary is absent; otherwise
  # returns the exit status of the upload command.
  local agent_bin=/workspace/buildkite-agent

  if [[ -f "$agent_bin" ]]; then
    # (annotation of the markdown summary is currently disabled)
    # "$agent_bin" annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md

    # The pattern is quoted, so the shell passes it to the agent unexpanded.
    "$agent_bin" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
}
|
||||||
|
|
||||||
|
main() {
  # Entry point of the vllm nightly benchmark:
  #   1. verify that GPUs are present (check_gpus exits the script otherwise),
  #   2. run every serving test listed in tests/nightly-tests.json,
  #   3. summarize the results and upload them to Buildkite.
  # Reads:  VLLM_SOURCE_CODE_LOC (root of the vllm checkout).
  # Sets:   RESULTS_FOLDER, BENCHMARK_ROOT, CURRENT_LLM_SERVING_ENGINE.

  check_gpus

  # enter vllm directory; every path below is relative to it, so abort
  # instead of benchmarking from the wrong working directory.
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || {
    echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks." >&2
    exit 1
  }

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # exported so that the summary script can read the engine name
  export CURRENT_LLM_SERVING_ENGINE=vllm
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # dependencies of the summary script
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
results_folder = Path("results/")
|
||||||
|
|
||||||
|
# serving results and the keys that will be printed into markdown
|
||||||
|
serving_results = []
|
||||||
|
serving_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"gpu_type": "GPU",
|
||||||
|
"completed": "Successful req.",
|
||||||
|
"request_throughput": "Tput (req/s)",
|
||||||
|
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||||
|
"std_ttft_ms": "Std TTFT (ms)",
|
||||||
|
"mean_itl_ms": "Mean ITL (ms)",
|
||||||
|
"std_itl_ms": "Std ITL (ms)",
|
||||||
|
"input_throughput": "Input Tput (tok/s)",
|
||||||
|
"output_throughput": "Output Tput (tok/s)",
|
||||||
|
"engine": "Engine",
|
||||||
|
}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
# collect results
|
||||||
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
|
with open(test_file, "r") as f:
|
||||||
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
|
# attach the benchmarking command to raw_result
|
||||||
|
with open(test_file.with_suffix(".commands"), "r") as f:
|
||||||
|
command = json.loads(f.read())
|
||||||
|
raw_result.update(command)
|
||||||
|
|
||||||
|
# update the test name of this result
|
||||||
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
|
# add the result to raw_result
|
||||||
|
serving_results.append(raw_result)
|
||||||
|
continue
|
||||||
|
|
||||||
|
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||||
|
|
||||||
|
if not serving_results.empty:
|
||||||
|
serving_results = serving_results[list(
|
||||||
|
serving_column_mapping.keys())].rename(
|
||||||
|
columns=serving_column_mapping)
|
||||||
|
|
||||||
|
serving_md_table_with_headers = tabulate(serving_results,
|
||||||
|
headers='keys',
|
||||||
|
tablefmt='pipe',
|
||||||
|
showindex=False)
|
||||||
|
# remove the first line of header
|
||||||
|
serving_md_table_lines = serving_md_table_with_headers.split('\n')
|
||||||
|
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
|
||||||
|
|
||||||
|
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||||
|
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||||
|
|
||||||
|
# document benchmarking results in markdown
|
||||||
|
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
|
||||||
|
# document results with header.
|
||||||
|
# for those who wants to reproduce our benchmark.
|
||||||
|
f.write(serving_md_table_with_headers)
|
||||||
|
f.write('\n')
|
||||||
|
|
||||||
|
# document benchmarking results in json
|
||||||
|
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||||
|
|
||||||
|
results = serving_results.to_dict(orient='records')
|
||||||
|
f.write(json.dumps(results))
|
||||||
17
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
17
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/sh
# Wait until the image manifest tagged $BUILDKITE_COMMIT can be fetched from
# the public ECR repository q9t5s3a7/vllm-ci-test-repo.
# Exits 0 as soon as the manifest request returns HTTP 200, and 1 after
# 1000 unsuccessful attempts made 5 seconds apart.

# anonymous pull token for the repository
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

retries=0
while [ "$retries" -lt 1000 ]; do
  # only the HTTP status code is kept; the response body is discarded
  status=$(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")
  # compared as a string so that an empty result cannot break the test
  if [ "$status" = "200" ]; then
    exit 0
  fi

  echo "Waiting for image to be available..."

  retries=$((retries + 1))
  sleep 5
done

exit 1
|
||||||
67
.buildkite/nightly-benchmarks/tests/descriptions.md
Normal file
67
.buildkite/nightly-benchmarks/tests/descriptions.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
|
||||||
|
## Latency tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's end-to-end latency under a controlled setup.
|
||||||
|
|
||||||
|
- Input length: 32 tokens.
|
||||||
|
- Output length: 128 tokens.
|
||||||
|
- Batch size: fixed (8).
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||||
|
|
||||||
|
### Latency benchmarking results
|
||||||
|
|
||||||
|
{latency_tests_markdown_table}
|
||||||
|
|
||||||
|
## Throughput tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's throughput.
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: throughput.
|
||||||
|
|
||||||
|
### Throughput benchmarking results
|
||||||
|
|
||||||
|
{throughput_tests_markdown_table}
|
||||||
|
|
||||||
|
## Serving tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's real serving metrics.
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||||
|
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||||
|
|
||||||
|
### Serving benchmarking results
|
||||||
|
|
||||||
|
{serving_tests_markdown_table}
|
||||||
|
|
||||||
|
## json version of the benchmarking tables
|
||||||
|
|
||||||
|
This section contains the data of the markdown tables above in JSON format.
|
||||||
|
You can load the benchmarking tables into pandas dataframes as follows:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
benchmarking_results_json = """The json string"""
|
||||||
|
benchmarking_results = json.loads(benchmarking_results_json)
|
||||||
|
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
|
||||||
|
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
|
||||||
|
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
|
||||||
|
```
|
||||||
|
|
||||||
|
The json string for all benchmarking tables:
|
||||||
|
```json
|
||||||
|
{benchmarking_results_in_json_string}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
||||||
|
|
||||||
32
.buildkite/nightly-benchmarks/tests/latency-tests.json
Normal file
32
.buildkite/nightly-benchmarks/tests/latency-tests.json
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama70B_tp4",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num-iters-warmup": 5,
|
||||||
|
"num-iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_mixtral8x7B_tp2",
|
||||||
|
"parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num-iters-warmup": 5,
|
||||||
|
"num-iters": 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
116
.buildkite/nightly-benchmarks/tests/nightly-tests.json
Normal file
116
.buildkite/nightly-benchmarks/tests/nightly-tests.json
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "llama8B_tp1",
|
||||||
|
"qps_list": [4],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tp": 1,
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "float16",
|
||||||
|
"max_batch_size": 256,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_output_len": 4096,
|
||||||
|
"trt_llm_version": "r24.04"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": ""
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "llama70B_tp4",
|
||||||
|
"qps_list": [2],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tp": 4,
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "float16",
|
||||||
|
"max_batch_size": 256,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_output_len": 4096,
|
||||||
|
"trt_llm_version": "r24.04"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": ""
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "mixtral8x7B_tp2",
|
||||||
|
"qps_list": [2],
|
||||||
|
"common_parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tp": 2,
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 500,
|
||||||
|
"port": 8000
|
||||||
|
},
|
||||||
|
"lmdeploy_server_parameters": {
|
||||||
|
},
|
||||||
|
"lmdeploy_client_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_server_parameters": {
|
||||||
|
},
|
||||||
|
"tgi_client_parameters": {
|
||||||
|
"endpoint": "/generate_stream"
|
||||||
|
},
|
||||||
|
"trt_server_parameters": {
|
||||||
|
"model_type": "llama",
|
||||||
|
"model_dtype": "float16",
|
||||||
|
"max_batch_size": 256,
|
||||||
|
"max_input_len": 4096,
|
||||||
|
"max_output_len": 4096,
|
||||||
|
"trt_llm_version": "r24.04"
|
||||||
|
},
|
||||||
|
"trt_client_parameters": {
|
||||||
|
"endpoint": "/v2/models/ensemble/generate_stream"
|
||||||
|
},
|
||||||
|
"vllm_server_parameters": {
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": ""
|
||||||
|
},
|
||||||
|
"vllm_client_parameters": {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
59
.buildkite/nightly-benchmarks/tests/serving-tests.json
Normal file
59
.buildkite/nightly-benchmarks/tests/serving-tests.json
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama70B_tp4_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
35
.buildkite/nightly-benchmarks/tests/throughput-tests.json
Normal file
35
.buildkite/nightly-benchmarks/tests/throughput-tests.json
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama70B_tp4",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_mixtral8x7B_tp2",
|
||||||
|
"parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
17
.buildkite/release-pipeline.yaml
Normal file
17
.buildkite/release-pipeline.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
steps:
|
||||||
|
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue
|
||||||
|
commands:
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
|
- "mkdir artifacts"
|
||||||
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
|
# rename the files to change linux -> manylinux1
|
||||||
|
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
||||||
|
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
||||||
|
matrix:
|
||||||
|
setup:
|
||||||
|
cuda_version:
|
||||||
|
- "11.8.0"
|
||||||
|
- "12.1.0"
|
||||||
@@ -2,6 +2,15 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Print ROCm version
|
# Print ROCm version
|
||||||
|
echo "--- Confirming Clean Initial State"
|
||||||
|
while true; do
|
||||||
|
sleep 3
|
||||||
|
if grep -q clean /opt/amdgpu/etc/gpu_state; then
|
||||||
|
echo "GPUs state is \"clean\""
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
echo "--- ROCm info"
|
echo "--- ROCm info"
|
||||||
rocminfo
|
rocminfo
|
||||||
|
|
||||||
@@ -45,15 +54,10 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "--- Building container"
|
echo "--- Pulling container"
|
||||||
sha=$(git rev-parse --short HEAD)
|
image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
image_name=rocm_${sha}
|
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
|
docker pull ${image_name}
|
||||||
docker build \
|
|
||||||
-t ${image_name} \
|
|
||||||
-f Dockerfile.rocm \
|
|
||||||
--progress plain \
|
|
||||||
.
|
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
||||||
|
|||||||
@@ -4,21 +4,25 @@ set -ex
|
|||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t cpu-test -f Dockerfile.cpu .
|
docker build -t cpu-test -f Dockerfile.cpu .
|
||||||
|
docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { docker rm -f cpu-test || true; }
|
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image
|
# Run the image
|
||||||
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||||
|
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
|
||||||
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||||
|
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
|
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
|
||||||
|
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test bash -c "cd tests;
|
docker exec cpu-test bash -c "cd tests;
|
||||||
pip install pytest Pillow protobuf
|
pip install pytest Pillow protobuf
|
||||||
bash ../.buildkite/download-images.sh
|
|
||||||
cd ../
|
cd ../
|
||||||
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
|
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
|
||||||
|
|||||||
105
.buildkite/run-multi-node-test.sh
Executable file
105
.buildkite/run-multi-node-test.sh
Executable file
@@ -0,0 +1,105 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -euox pipefail
|
||||||
|
|
||||||
|
if [[ $# -lt 4 ]]; then
|
||||||
|
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
WORKING_DIR=$1
|
||||||
|
NUM_NODES=$2
|
||||||
|
NUM_GPUS=$3
|
||||||
|
DOCKER_IMAGE=$4
|
||||||
|
|
||||||
|
shift 4
|
||||||
|
COMMANDS=("$@")
|
||||||
|
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
||||||
|
echo "The number of commands must be equal to the number of nodes."
|
||||||
|
echo "Number of nodes: $NUM_NODES"
|
||||||
|
echo "Number of commands: ${#COMMANDS[@]}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "List of commands"
|
||||||
|
for command in "${COMMANDS[@]}"; do
|
||||||
|
echo $command
|
||||||
|
done
|
||||||
|
|
||||||
|
start_network() {
|
||||||
|
docker network create --subnet=192.168.10.0/24 docker-net
|
||||||
|
}
|
||||||
|
|
||||||
|
start_nodes() {
|
||||||
|
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||||
|
GPU_DEVICES='"device='
|
||||||
|
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||||
|
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||||
|
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||||
|
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||||
|
GPU_DEVICES+=','
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
GPU_DEVICES+='"'
|
||||||
|
|
||||||
|
# start the container in detached mode
|
||||||
|
# things to note:
|
||||||
|
# 1. --shm-size=10.24gb is required. don't use --ipc=host
|
||||||
|
# 2. pass HF_TOKEN to the container
|
||||||
|
# 3. map the huggingface cache directory to the container
|
||||||
|
# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||||
|
# starting from 192.168.10.11)
|
||||||
|
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
|
||||||
|
|
||||||
|
# organize containers into a ray cluster
|
||||||
|
if [ $node -eq 0 ]; then
|
||||||
|
# start the ray head node
|
||||||
|
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
|
||||||
|
# wait for the head node to be ready
|
||||||
|
sleep 10
|
||||||
|
else
|
||||||
|
# start the ray worker nodes, and connect them to the head node
|
||||||
|
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# wait for the cluster to be ready
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
# print the cluster status
|
||||||
|
docker exec node0 /bin/bash -c "ray status"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_nodes() {
|
||||||
|
# important: iterate in reverse order to start the head node last
|
||||||
|
# we start the worker nodes first, in detached mode, and then start the head node
|
||||||
|
# in the foreground, so that the output of the head node is visible in the buildkite logs
|
||||||
|
for node in $(seq $(($NUM_NODES - 1)) -1 0); do
|
||||||
|
GPU_DEVICES='"device='
|
||||||
|
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||||
|
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||||
|
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||||
|
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||||
|
GPU_DEVICES+=','
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
GPU_DEVICES+='"'
|
||||||
|
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||||
|
if [ $node -ne 0 ]; then
|
||||||
|
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||||
|
else
|
||||||
|
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
}
|
||||||
|
cleanup() {
|
||||||
|
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||||
|
docker stop node$node
|
||||||
|
done
|
||||||
|
docker network rm docker-net
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
start_network
|
||||||
|
start_nodes
|
||||||
|
run_nodes
|
||||||
|
|
||||||
14
.buildkite/run-openvino-test.sh
Executable file
14
.buildkite/run-openvino-test.sh
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t openvino-test -f Dockerfile.openvino .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
|
||||||
14
.buildkite/run-xpu-test.sh
Normal file
14
.buildkite/run-xpu-test.sh
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# This script builds the XPU docker image and runs the offline inference inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t xpu-test -f Dockerfile.xpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f xpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
|
||||||
@@ -1,11 +1,39 @@
|
|||||||
# In this file, you can add more tests to run either by adding a new step or
|
# In this file, you can add more tests to run either by adding a new step or
|
||||||
# adding a new command to an existing step. See different options here for examples.
|
# adding a new command to an existing step. See different options here for examples.
|
||||||
# This script will be feed into Jinja template in `test-template.j2` to generate
|
|
||||||
# the final pipeline yaml file.
|
# This script will be fed into the Jinja template in `test-template-aws.j2` at
|
||||||
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||||
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
- label: Async Engine, Inputs, Utils, Worker Test
|
||||||
|
fast_check: true
|
||||||
|
fast_check_only: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s async_engine # Async Engine
|
||||||
|
- bash ../.buildkite/download-images.sh # Inputs
|
||||||
|
- pytest -v -s test_inputs.py
|
||||||
|
- pytest -v -s multimodal
|
||||||
|
- pytest -v -s test_utils.py # Utils
|
||||||
|
- pytest -v -s worker # Worker
|
||||||
|
|
||||||
|
- label: Tensorizer, Metrics, Tracing Test
|
||||||
|
fast_check: true
|
||||||
|
fast_check_only: true
|
||||||
|
commands:
|
||||||
|
- apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
|
||||||
|
- pytest -v -s metrics # Metrics
|
||||||
|
- "pip install \
|
||||||
|
opentelemetry-sdk \
|
||||||
|
opentelemetry-api \
|
||||||
|
opentelemetry-exporter-otlp \
|
||||||
|
opentelemetry-semantic-conventions-ai" # Tracing
|
||||||
|
- pytest -v -s tracing
|
||||||
|
|
||||||
- label: Regression Test
|
- label: Regression Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
|
fast_check: true
|
||||||
command: pytest -v -s test_regression.py
|
command: pytest -v -s test_regression.py
|
||||||
working_dir: "/vllm-workspace/tests" # optional
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
@@ -15,58 +43,101 @@ steps:
|
|||||||
|
|
||||||
- label: Basic Correctness Test
|
- label: Basic Correctness Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
|
fast_check: true
|
||||||
commands:
|
commands:
|
||||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
- label: Core Test
|
- label: Core Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
command: pytest -v -s core
|
fast_check: true
|
||||||
|
commands:
|
||||||
|
- pytest -v -s core
|
||||||
|
- pytest -v -s distributed/test_parallel_state.py
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test
|
- label: Distributed Comm Ops Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s distributed/test_comm_ops.py
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
|
commands:
|
||||||
|
- pytest -v -s distributed/test_comm_ops.py
|
||||||
|
- pytest -v -s distributed/test_shm_broadcast.py
|
||||||
|
|
||||||
- label: Distributed Tests
|
- label: 2 Node Tests (4 GPUs in total)
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 2
|
||||||
|
num_nodes: 2
|
||||||
|
commands:
|
||||||
|
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||||
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||||
|
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||||
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||||
|
|
||||||
|
- label: Distributed Tests (2 GPUs)
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
|
- bash ../.buildkite/download-images.sh
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist.py
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||||
|
|
||||||
- label: Distributed Tests (Multiple Groups)
|
- label: Distributed Tests (4 GPUs)
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
fast_check: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
|
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
|
||||||
|
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
|
|
||||||
|
- label: Pipeline Parallelism Test
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 4
|
||||||
|
commands:
|
||||||
|
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
|
|
||||||
- label: Engine Test
|
- label: Engine Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
|
commands:
|
||||||
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||||
|
# OOM in the CI unless we run this separately
|
||||||
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
- label: Entrypoints Test
|
- label: Entrypoints Test
|
||||||
|
fast_check: true
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s entrypoints -m llm
|
- pytest -v -s entrypoints/llm
|
||||||
- pytest -v -s entrypoints -m openai
|
- pytest -v -s entrypoints/openai
|
||||||
|
|
||||||
- label: Examples Test
|
- label: Examples Test
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
@@ -90,19 +161,22 @@ steps:
|
|||||||
|
|
||||||
- label: Kernels Test %N
|
- label: Kernels Test %N
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
commands:
|
||||||
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
|
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: Models Test
|
- label: Models Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models -m \"not llava\"
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
|
- pytest -v -s models -m \"not vlm\"
|
||||||
|
|
||||||
- label: Llava Test
|
- label: Vision Language Models Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
commands:
|
commands:
|
||||||
- bash ../.buildkite/download-images.sh
|
- bash ../.buildkite/download-images.sh
|
||||||
- pytest -v -s models -m llava
|
- pytest -v -s models -m vlm
|
||||||
|
|
||||||
- label: Prefix Caching Test
|
- label: Prefix Caching Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -118,7 +192,9 @@ steps:
|
|||||||
command: pytest -v -s test_logits_processor.py
|
command: pytest -v -s test_logits_processor.py
|
||||||
|
|
||||||
- label: Utils Test
|
- label: Utils Test
|
||||||
command: pytest -v -s test_utils.py
|
commands:
|
||||||
|
- pytest -v -s test_utils.py
|
||||||
|
- pytest -v -s test_embedded_commit.py
|
||||||
|
|
||||||
- label: Worker Test
|
- label: Worker Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -141,11 +217,17 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
commands:
|
commands:
|
||||||
|
# FIXME: find out which code initializes CUDA before running the test
|
||||||
|
# until that is fixed, we need to use spawn to run this test
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s -x lora/test_long_context.py
|
- pytest -v -s -x lora/test_long_context.py
|
||||||
|
|
||||||
- label: Tensorizer Test
|
- label: Tensorizer Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
|
commands:
|
||||||
|
- apt-get install curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s tensorizer_loader
|
||||||
|
|
||||||
- label: Metrics Test
|
- label: Metrics Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -155,6 +237,15 @@ steps:
|
|||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s quantization
|
command: pytest -v -s quantization
|
||||||
|
|
||||||
|
- label: Tracing Test
|
||||||
|
commands:
|
||||||
|
- "pip install \
|
||||||
|
opentelemetry-sdk \
|
||||||
|
opentelemetry-api \
|
||||||
|
opentelemetry-exporter-otlp \
|
||||||
|
opentelemetry-semantic-conventions-ai"
|
||||||
|
- pytest -v -s tracing
|
||||||
|
|
||||||
- label: Benchmarks
|
- label: Benchmarks
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -162,9 +253,40 @@ steps:
|
|||||||
- pip install aiohttp
|
- pip install aiohttp
|
||||||
- bash run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
|
- label: LM Eval Small Models
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
|
- label: LM Eval Large Models
|
||||||
|
gpu: a100
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||||
|
|
||||||
- label: Documentation Build
|
- label: Documentation Build
|
||||||
working_dir: "/vllm-workspace/test_docs/docs"
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r requirements-docs.txt
|
- pip install -r requirements-docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
|
|
||||||
|
- label: Distributed Tests (A100)
|
||||||
|
gpu: a100
|
||||||
|
num_gpus: 4
|
||||||
|
commands:
|
||||||
|
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||||
|
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||||
|
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|||||||
@@ -1,92 +0,0 @@
|
|||||||
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
|
|
||||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- label: ":docker: build image"
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
|
|
||||||
- "docker push {{ docker_image }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- wait
|
|
||||||
|
|
||||||
- group: "AMD Tests"
|
|
||||||
depends_on: ~
|
|
||||||
steps:
|
|
||||||
{% for step in steps %}
|
|
||||||
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
|
|
||||||
- label: "AMD: {{ step.label }}"
|
|
||||||
agents:
|
|
||||||
queue: amd
|
|
||||||
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
soft_fail: true
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
- label: "Neuron Test"
|
|
||||||
depends_on: ~
|
|
||||||
agents:
|
|
||||||
queue: neuron
|
|
||||||
command: bash .buildkite/run-neuron-test.sh
|
|
||||||
soft_fail: false
|
|
||||||
|
|
||||||
- label: "Intel Test"
|
|
||||||
depends_on: ~
|
|
||||||
agents:
|
|
||||||
queue: intel
|
|
||||||
command: bash .buildkite/run-cpu-test.sh
|
|
||||||
|
|
||||||
{% for step in steps %}
|
|
||||||
- label: "{{ step.label }}"
|
|
||||||
agents:
|
|
||||||
{% if step.label == "Documentation Build" %}
|
|
||||||
queue: small_cpu_queue
|
|
||||||
{% elif step.no_gpu %}
|
|
||||||
queue: cpu_queue
|
|
||||||
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
|
|
||||||
queue: gpu_4_queue
|
|
||||||
{% else %}
|
|
||||||
queue: gpu_1_queue
|
|
||||||
{% endif %}
|
|
||||||
soft_fail: {{ step.soft_fail or false }}
|
|
||||||
{% if step.parallelism %}
|
|
||||||
parallelism: {{ step.parallelism }}
|
|
||||||
{% endif %}
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
plugins:
|
|
||||||
- docker#v5.2.0:
|
|
||||||
image: {{ docker_image }}
|
|
||||||
always-pull: true
|
|
||||||
propagate-environment: true
|
|
||||||
{% if not step.no_gpu %}
|
|
||||||
gpus: all
|
|
||||||
{% endif %}
|
|
||||||
{% if step.label == "Benchmarks" %}
|
|
||||||
mount-buildkite-agent: true
|
|
||||||
{% endif %}
|
|
||||||
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
|
|
||||||
environment:
|
|
||||||
- VLLM_USAGE_SOURCE=ci-test
|
|
||||||
- HF_TOKEN
|
|
||||||
{% if step.label == "Speculative decoding tests" %}
|
|
||||||
- VLLM_ATTENTION_BACKEND=XFORMERS
|
|
||||||
{% endif %}
|
|
||||||
volumes:
|
|
||||||
- /dev/shm:/dev/shm
|
|
||||||
{% endfor %}
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
|
|
||||||
{% set default_num_gpu = 1 %}
|
|
||||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- label: ":docker: build image"
|
|
||||||
commands:
|
|
||||||
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
|
|
||||||
- "docker push {{ docker_image }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- wait
|
|
||||||
|
|
||||||
- group: "AMD Tests"
|
|
||||||
depends_on: ~
|
|
||||||
steps:
|
|
||||||
{% for step in steps %}
|
|
||||||
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
|
|
||||||
- label: "AMD: {{ step.label }}"
|
|
||||||
agents:
|
|
||||||
queue: amd
|
|
||||||
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
soft_fail: true
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
- label: "Neuron Test"
|
|
||||||
depends_on: ~
|
|
||||||
agents:
|
|
||||||
queue: neuron
|
|
||||||
command: bash .buildkite/run-neuron-test.sh
|
|
||||||
soft_fail: false
|
|
||||||
|
|
||||||
- label: "Intel Test"
|
|
||||||
depends_on: ~
|
|
||||||
agents:
|
|
||||||
queue: intel
|
|
||||||
command: bash .buildkite/run-cpu-test.sh
|
|
||||||
|
|
||||||
{% for step in steps %}
|
|
||||||
- label: "{{ step.label }}"
|
|
||||||
agents:
|
|
||||||
queue: kubernetes
|
|
||||||
soft_fail: {{ step.soft_fail or false }}
|
|
||||||
{% if step.parallelism %}
|
|
||||||
parallelism: {{ step.parallelism }}
|
|
||||||
{% endif %}
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
plugins:
|
|
||||||
- kubernetes:
|
|
||||||
podSpec:
|
|
||||||
{% if step.num_gpus %}
|
|
||||||
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
|
|
||||||
{% endif %}
|
|
||||||
volumes:
|
|
||||||
- name: dshm
|
|
||||||
emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
containers:
|
|
||||||
- image: "{{ docker_image }}"
|
|
||||||
command: ["bash"]
|
|
||||||
args:
|
|
||||||
- '-c'
|
|
||||||
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
|
||||||
{% if not step.no_gpu %}
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
|
||||||
limits:
|
|
||||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
|
||||||
{% endif %}
|
|
||||||
env:
|
|
||||||
- name: VLLM_USAGE_SOURCE
|
|
||||||
value: ci-test
|
|
||||||
- name: HF_TOKEN
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: hf-token-secret
|
|
||||||
key: token
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
{% endfor %}
|
|
||||||
2
.github/FUNDING.yml
vendored
Normal file
2
.github/FUNDING.yml
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
github: [vllm-project]
|
||||||
|
open_collective: [vllm]
|
||||||
21
.github/workflows/add_label_automerge.yml
vendored
Normal file
21
.github/workflows/add_label_automerge.yml
vendored
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
name: Add label on auto-merge enabled
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types:
|
||||||
|
- auto_merge_enabled
|
||||||
|
jobs:
|
||||||
|
add-label-on-auto-merge:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Add label
|
||||||
|
uses: actions/github-script@v5
|
||||||
|
with:
|
||||||
|
script: |
|
||||||
|
github.rest.issues.addLabels({
|
||||||
|
owner: context.repo.owner,
|
||||||
|
repo: context.repo.repo,
|
||||||
|
issue_number: context.issue.number,
|
||||||
|
labels: ['ready']
|
||||||
|
})
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
23
.github/workflows/add_label_ready_comment.yml
vendored
Normal file
23
.github/workflows/add_label_ready_comment.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
name: Add Ready Label on Ready Comment
|
||||||
|
|
||||||
|
on:
|
||||||
|
issue_comment:
|
||||||
|
types: [created]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
add-ready-label:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
|
||||||
|
steps:
|
||||||
|
- name: Add label
|
||||||
|
uses: actions/github-script@v5
|
||||||
|
with:
|
||||||
|
script: |
|
||||||
|
github.rest.issues.addLabels({
|
||||||
|
owner: context.repo.owner,
|
||||||
|
repo: context.repo.repo,
|
||||||
|
issue_number: context.issue.number,
|
||||||
|
labels: ['ready']
|
||||||
|
})
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
22
.github/workflows/mypy.yaml
vendored
22
.github/workflows/mypy.yaml
vendored
@@ -32,20 +32,22 @@ jobs:
|
|||||||
pip install types-setuptools
|
pip install types-setuptools
|
||||||
- name: Mypy
|
- name: Mypy
|
||||||
run: |
|
run: |
|
||||||
|
mypy tests --config-file pyproject.toml
|
||||||
|
mypy vllm/*.py --config-file pyproject.toml
|
||||||
mypy vllm/attention --config-file pyproject.toml
|
mypy vllm/attention --config-file pyproject.toml
|
||||||
mypy vllm/core --config-file pyproject.toml
|
mypy vllm/core --config-file pyproject.toml
|
||||||
mypy vllm/distributed --config-file pyproject.toml
|
mypy vllm/distributed --config-file pyproject.toml
|
||||||
|
mypy vllm/engine --config-file pyproject.toml
|
||||||
mypy vllm/entrypoints --config-file pyproject.toml
|
mypy vllm/entrypoints --config-file pyproject.toml
|
||||||
mypy vllm/executor --config-file pyproject.toml
|
mypy vllm/executor --config-file pyproject.toml
|
||||||
mypy vllm/multimodal --config-file pyproject.toml
|
mypy vllm/inputs --config-file pyproject.toml
|
||||||
mypy vllm/usage --config-file pyproject.toml
|
|
||||||
mypy vllm/*.py --config-file pyproject.toml
|
|
||||||
mypy vllm/transformers_utils --config-file pyproject.toml
|
|
||||||
mypy vllm/engine --config-file pyproject.toml
|
|
||||||
mypy vllm/worker --config-file pyproject.toml
|
|
||||||
mypy vllm/spec_decode --config-file pyproject.toml
|
|
||||||
mypy vllm/model_executor --config-file pyproject.toml
|
|
||||||
mypy vllm/lora --config-file pyproject.toml
|
|
||||||
mypy vllm/logging --config-file pyproject.toml
|
mypy vllm/logging --config-file pyproject.toml
|
||||||
mypy vllm/model_executor --config-file pyproject.toml
|
mypy vllm/lora --config-file pyproject.toml
|
||||||
|
mypy vllm/model_executor --config-file pyproject.toml
|
||||||
|
mypy vllm/multimodal --config-file pyproject.toml
|
||||||
|
mypy vllm/platforms --config-file pyproject.toml
|
||||||
|
mypy vllm/spec_decode --config-file pyproject.toml
|
||||||
|
mypy vllm/transformers_utils --config-file pyproject.toml
|
||||||
|
mypy vllm/usage --config-file pyproject.toml
|
||||||
|
mypy vllm/worker --config-file pyproject.toml
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/publish.yml
vendored
2
.github/workflows/publish.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
os: ['ubuntu-20.04']
|
os: ['ubuntu-20.04']
|
||||||
python-version: ['3.8', '3.9', '3.10', '3.11']
|
python-version: ['3.8', '3.9', '3.10', '3.11']
|
||||||
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||||
cuda-version: ['11.8', '12.1']
|
cuda-version: ['11.8', '12.1']
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
21
.github/workflows/reminder_comment.yml
vendored
Normal file
21
.github/workflows/reminder_comment.yml
vendored
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
name: PR Reminder Comment Bot
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types: [opened]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pr_reminder:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Remind to run full CI on PR
|
||||||
|
uses: actions/github-script@v6
|
||||||
|
with:
|
||||||
|
script: |
|
||||||
|
github.rest.issues.createComment({
|
||||||
|
owner: context.repo.owner,
|
||||||
|
repo: context.repo.repo,
|
||||||
|
issue_number: context.issue.number,
|
||||||
|
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only trigger `fastcheck` CI to run, which consists only a small and essential subset of tests to quickly catch errors with the flexibility to run extra individual tests on top (you can do this by unblocking test steps in the Buildkite run). \n\nFull CI run is still required to merge this PR so once the PR is ready to go, please make sure to run it. If you need all test signals in between PR commits, you can trigger full CI as well.\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
|
||||||
|
})
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,3 +1,6 @@
|
|||||||
|
# vllm commit id, generated by setup.py
|
||||||
|
vllm/commit_id.py
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
|
|||||||
@@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)
|
|||||||
|
|
||||||
project(vllm_extensions LANGUAGES CXX)
|
project(vllm_extensions LANGUAGES CXX)
|
||||||
|
|
||||||
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
|
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||||
|
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||||
|
|
||||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||||
@@ -31,9 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from Dockerfile.rocm
|
# versions are derived from Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -98,18 +98,11 @@ elseif(HIP_FOUND)
|
|||||||
# .hip extension automatically, HIP must be enabled explicitly.
|
# .hip extension automatically, HIP must be enabled explicitly.
|
||||||
enable_language(HIP)
|
enable_language(HIP)
|
||||||
|
|
||||||
# ROCm 5.x
|
# ROCm 5.X and 6.X
|
||||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
|
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
|
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||||
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
|
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||||
endif()
|
|
||||||
|
|
||||||
# ROCm 6.x
|
|
||||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
|
|
||||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
|
|
||||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
|
|
||||||
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
|
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
||||||
@@ -178,6 +171,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
|
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||||
"csrc/custom_all_reduce.cu"
|
"csrc/custom_all_reduce.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
||||||
|
|||||||
92
Dockerfile
92
Dockerfile
@@ -5,9 +5,26 @@
|
|||||||
# docs/source/dev/dockerfile/dockerfile.rst and
|
# docs/source/dev/dockerfile/dockerfile.rst and
|
||||||
# docs/source/assets/dev/dockerfile-stages-dependency.png
|
# docs/source/assets/dev/dockerfile-stages-dependency.png
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG PYTHON_VERSION=3
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||||
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y ccache software-properties-common \
|
||||||
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
|
||||||
|
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||||
|
&& python3 --version \
|
||||||
|
&& python3 -m pip --version
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y python3-pip git curl sudo
|
&& apt-get install -y python3-pip git curl sudo
|
||||||
@@ -16,7 +33,7 @@ RUN apt-get update -y \
|
|||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
# this won't be needed for future versions of this docker image
|
# this won't be needed for future versions of this docker image
|
||||||
# or future versions of triton.
|
# or future versions of triton.
|
||||||
RUN ldconfig /usr/local/cuda-12.4/compat/
|
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
@@ -24,14 +41,11 @@ WORKDIR /workspace
|
|||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-cuda.txt
|
python3 -m pip install -r requirements-cuda.txt
|
||||||
|
|
||||||
# install development dependencies
|
COPY requirements-mamba.txt requirements-mamba.txt
|
||||||
COPY requirements-lint.txt requirements-lint.txt
|
RUN python3 -m pip install packaging
|
||||||
COPY requirements-test.txt requirements-test.txt
|
RUN python3 -m pip install -r requirements-mamba.txt
|
||||||
COPY requirements-dev.txt requirements-dev.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements-dev.txt
|
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
# can be useful for both `dev` and `test`
|
# can be useful for both `dev` and `test`
|
||||||
@@ -41,14 +55,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
|
|||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
FROM dev AS build
|
FROM base AS build
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3
|
||||||
|
|
||||||
# install build dependencies
|
# install build dependencies
|
||||||
COPY requirements-build.txt requirements-build.txt
|
COPY requirements-build.txt requirements-build.txt
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-build.txt
|
python3 -m pip install -r requirements-build.txt
|
||||||
|
|
||||||
# install compiler cache to speed up compilation leveraging local or remote caching
|
# install compiler cache to speed up compilation leveraging local or remote caching
|
||||||
RUN apt-get update -y && apt-get install -y ccache
|
RUN apt-get update -y && apt-get install -y ccache
|
||||||
@@ -72,6 +88,9 @@ ENV NVCC_THREADS=$nvcc_threads
|
|||||||
# make sure punica kernels are built (for LoRA)
|
# make sure punica kernels are built (for LoRA)
|
||||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||||
|
|
||||||
|
ARG buildkite_commit
|
||||||
|
ENV BUILDKITE_COMMIT=${buildkite_commit}
|
||||||
|
|
||||||
ARG USE_SCCACHE
|
ARG USE_SCCACHE
|
||||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
@@ -83,8 +102,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||||
&& export SCCACHE_BUCKET=vllm-build-sccache \
|
&& export SCCACHE_BUCKET=vllm-build-sccache \
|
||||||
&& export SCCACHE_REGION=us-west-2 \
|
&& export SCCACHE_REGION=us-west-2 \
|
||||||
|
&& export CMAKE_BUILD_TYPE=Release \
|
||||||
&& sccache --show-stats \
|
&& sccache --show-stats \
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||||
&& sccache --show-stats; \
|
&& sccache --show-stats; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -92,7 +112,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
|
|||||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||||
python3 setup.py bdist_wheel --dist-dir=dist; \
|
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
||||||
@@ -101,9 +121,36 @@ RUN python3 check-wheel-size.py dist
|
|||||||
|
|
||||||
#################### EXTENSION Build IMAGE ####################
|
#################### EXTENSION Build IMAGE ####################
|
||||||
|
|
||||||
|
#################### DEV IMAGE ####################
|
||||||
|
FROM base as dev
|
||||||
|
|
||||||
|
COPY requirements-lint.txt requirements-lint.txt
|
||||||
|
COPY requirements-test.txt requirements-test.txt
|
||||||
|
COPY requirements-dev.txt requirements-dev.txt
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
#################### DEV IMAGE ####################
|
||||||
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
FROM dev as mamba-builder
|
||||||
|
# max jobs used for build
|
||||||
|
ARG max_jobs=2
|
||||||
|
ENV MAX_JOBS=${max_jobs}
|
||||||
|
|
||||||
|
WORKDIR /usr/src/mamba
|
||||||
|
|
||||||
|
COPY requirements-mamba.txt requirements-mamba.txt
|
||||||
|
|
||||||
|
# Download the wheel or build it if a pre-compiled release doesn't exist
|
||||||
|
RUN pip --verbose wheel -r requirements-mamba.txt \
|
||||||
|
--no-build-isolation --no-deps --no-cache-dir
|
||||||
|
|
||||||
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
@@ -113,12 +160,19 @@ RUN apt-get update -y \
|
|||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
# this won't be needed for future versions of this docker image
|
# this won't be needed for future versions of this docker image
|
||||||
# or future versions of triton.
|
# or future versions of triton.
|
||||||
RUN ldconfig /usr/local/cuda-12.4/compat/
|
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||||
|
|
||||||
# install vllm wheel first, so that torch etc will be installed
|
# install vllm wheel first, so that torch etc will be installed
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install dist/*.whl --verbose
|
python3 -m pip install dist/*.whl --verbose
|
||||||
|
|
||||||
|
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
@@ -131,7 +185,7 @@ ADD . /vllm-workspace/
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-dev.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
# doc requires source code
|
# doc requires source code
|
||||||
# we hide them inside `test_docs/` , so that this source code
|
# we hide them inside `test_docs/` , so that this source code
|
||||||
@@ -148,7 +202,7 @@ FROM vllm-base AS vllm-openai
|
|||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install accelerate hf_transfer modelscope
|
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,13 @@ RUN apt-get update -y \
|
|||||||
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
|
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
|
# intel-openmp provides additional performance improvement vs. openmp
|
||||||
|
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
||||||
|
RUN pip install intel-openmp
|
||||||
|
|
||||||
|
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
|
||||||
|
|
||||||
|
|
||||||
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
|
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
|
||||||
|
|
||||||
@@ -21,10 +27,14 @@ WORKDIR /workspace/vllm
|
|||||||
|
|
||||||
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
|
|
||||||
|
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||||
|
ARG VLLM_CPU_DISABLE_AVX512
|
||||||
|
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
WORKDIR /workspace/
|
WORKDIR /workspace/
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
26
Dockerfile.openvino
Normal file
26
Dockerfile.openvino
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||||
|
# to run the OpenAI compatible server.
|
||||||
|
|
||||||
|
FROM ubuntu:22.04 AS dev
|
||||||
|
|
||||||
|
RUN apt-get update -y && \
|
||||||
|
apt-get install -y python3-pip git
|
||||||
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
# copy requirements
|
||||||
|
COPY requirements-build.txt /workspace/vllm/
|
||||||
|
COPY requirements-common.txt /workspace/vllm/
|
||||||
|
COPY requirements-openvino.txt /workspace/vllm/
|
||||||
|
|
||||||
|
COPY vllm/ /workspace/vllm/vllm
|
||||||
|
COPY setup.py /workspace/vllm/
|
||||||
|
|
||||||
|
# install build requirements
|
||||||
|
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
|
||||||
|
# build vLLM with OpenVINO backend
|
||||||
|
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
|
||||||
|
|
||||||
|
COPY examples/ /workspace/vllm/examples
|
||||||
|
COPY benchmarks/ /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
22
Dockerfile.ppc64le
Normal file
22
Dockerfile.ppc64le
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
FROM mambaorg/micromamba
|
||||||
|
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||||
|
USER root
|
||||||
|
|
||||||
|
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
|
# Some packages in requirements-cpu are installed here
|
||||||
|
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||||
|
# Currently these may not be available for venv or pip directly
|
||||||
|
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
|
||||||
|
|
||||||
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
# These packages will be in rocketce eventually
|
||||||
|
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||||
|
|
||||||
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
|
WORKDIR /vllm-workspace
|
||||||
|
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
213
Dockerfile.rocm
213
Dockerfile.rocm
@@ -1,35 +1,35 @@
|
|||||||
# default base image
|
# Default ROCm 6.1 base image
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
# Tested and supported base rocm/pytorch images
|
||||||
|
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
|
||||||
|
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
|
||||||
|
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||||
|
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
# Default ROCm ARCHes to build vLLM for.
|
||||||
|
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||||
|
|
||||||
RUN echo "Base image is $BASE_IMAGE"
|
# Whether to build CK-based flash-attention
|
||||||
|
# If 0, will not build flash attention
|
||||||
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
|
# This is useful for gfx target where flash-attention is not supported
|
||||||
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
# (i.e. those that do not appear in `FA_GFX_ARCHS`)
|
||||||
|
# Triton FA is used by default on ROCm now so this is unnecessary.
|
||||||
|
|
||||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
|
||||||
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
|
|
||||||
|
|
||||||
ARG FA_BRANCH="ae7928c"
|
|
||||||
RUN echo "FA_BRANCH is $FA_BRANCH"
|
|
||||||
|
|
||||||
# whether to build flash-attention
|
|
||||||
# if 0, will not build flash attention
|
|
||||||
# this is useful for gfx target where flash-attention is not supported
|
|
||||||
# In that case, we need to use the python reference attention implementation in vllm
|
|
||||||
ARG BUILD_FA="1"
|
ARG BUILD_FA="1"
|
||||||
|
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
||||||
|
ARG FA_BRANCH="ae7928c"
|
||||||
|
|
||||||
# whether to build triton on rocm
|
# Whether to build triton on rocm
|
||||||
ARG BUILD_TRITON="1"
|
ARG BUILD_TRITON="1"
|
||||||
|
ARG TRITON_BRANCH="0ef1848"
|
||||||
|
|
||||||
|
### Base image build stage
|
||||||
|
FROM $BASE_IMAGE AS base
|
||||||
|
|
||||||
|
# Import arg(s) defined before this build stage
|
||||||
|
ARG PYTORCH_ROCM_ARCH
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update && apt-get install python3 python3-pip -y
|
||||||
|
|
||||||
# Install some basic utilities
|
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
curl \
|
curl \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
@@ -40,76 +40,165 @@ RUN apt-get update && apt-get install -y \
|
|||||||
build-essential \
|
build-essential \
|
||||||
wget \
|
wget \
|
||||||
unzip \
|
unzip \
|
||||||
nvidia-cuda-toolkit \
|
|
||||||
tmux \
|
tmux \
|
||||||
|
ccache \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
### Mount Point ###
|
# When launching the container, mount the code directory to /vllm-workspace
|
||||||
# When launching the container, mount the code directory to /app
|
|
||||||
ARG APP_MOUNT=/vllm-workspace
|
ARG APP_MOUNT=/vllm-workspace
|
||||||
VOLUME [ ${APP_MOUNT} ]
|
|
||||||
WORKDIR ${APP_MOUNT}
|
WORKDIR ${APP_MOUNT}
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN pip install --upgrade pip
|
||||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
# Remove sccache so it doesn't interfere with ccache
|
||||||
|
# TODO: implement sccache support across components
|
||||||
|
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||||
|
# Install torch == 2.5.0 on ROCm
|
||||||
|
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
|
*"rocm-5.7"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
|
||||||
|
torchvision==0.20.0.dev20240710 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
|
||||||
|
*"rocm-6.0"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
|
||||||
|
torchvision==0.20.0.dev20240710 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
|
||||||
|
*"rocm-6.1"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
|
||||||
|
torchvision==0.20.0.dev20240710 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
||||||
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
||||||
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
||||||
|
|
||||||
# Install ROCm flash-attention
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
RUN if [ "$BUILD_FA" = "1" ]; then \
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
mkdir libs \
|
|
||||||
|
|
||||||
|
### AMD-SMI build stage
|
||||||
|
FROM base AS build_amdsmi
|
||||||
|
# Build amdsmi wheel always
|
||||||
|
RUN cd /opt/rocm/share/amd_smi \
|
||||||
|
&& pip wheel . --wheel-dir=/install
|
||||||
|
|
||||||
|
|
||||||
|
### Flash-Attention wheel build stage
|
||||||
|
FROM base AS build_fa
|
||||||
|
ARG BUILD_FA
|
||||||
|
ARG FA_GFX_ARCHS
|
||||||
|
ARG FA_BRANCH
|
||||||
|
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
||||||
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
if [ "$BUILD_FA" = "1" ]; then \
|
||||||
|
mkdir -p libs \
|
||||||
&& cd libs \
|
&& cd libs \
|
||||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||||
&& cd flash-attention \
|
&& cd flash-attention \
|
||||||
&& git checkout ${FA_BRANCH} \
|
&& git checkout "${FA_BRANCH}" \
|
||||||
&& git submodule update --init \
|
&& git submodule update --init \
|
||||||
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
|
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
|
*"rocm-5.7"*) \
|
||||||
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
|
export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
|
||||||
&& python3 setup.py install \
|
&& patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
|
||||||
&& cd ..; \
|
*) ;; esac \
|
||||||
|
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||||
|
# Create an empty directory otherwise as later build stages expect one
|
||||||
|
else mkdir -p /install; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
|
||||||
# Manually removed it so that later steps of numpy upgrade can continue
|
|
||||||
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
|
|
||||||
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
|
|
||||||
|
|
||||||
# build triton
|
### Triton wheel build stage
|
||||||
RUN if [ "$BUILD_TRITON" = "1" ]; then \
|
FROM base AS build_triton
|
||||||
|
ARG BUILD_TRITON
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
# Build triton wheel if `BUILD_TRITON = 1`
|
||||||
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
if [ "$BUILD_TRITON" = "1" ]; then \
|
||||||
mkdir -p libs \
|
mkdir -p libs \
|
||||||
&& cd libs \
|
&& cd libs \
|
||||||
&& pip uninstall -y triton \
|
&& git clone https://github.com/OpenAI/triton.git \
|
||||||
&& git clone https://github.com/ROCm/triton.git \
|
&& cd triton \
|
||||||
&& cd triton/python \
|
&& git checkout "${TRITON_BRANCH}" \
|
||||||
&& pip3 install . \
|
&& cd python \
|
||||||
&& cd ../..; \
|
&& python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||||
|
# Create an empty directory otherwise as later build stages expect one
|
||||||
|
else mkdir -p /install; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
WORKDIR /vllm-workspace
|
|
||||||
|
### Final vLLM build stage
|
||||||
|
FROM base AS final
|
||||||
|
# Import the vLLM development directory from the build context
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
#RUN python3 -m pip install pynvml # to be removed eventually
|
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
||||||
RUN python3 -m pip install --upgrade pip numba
|
# Manually remove it so that later steps of numpy upgrade can continue
|
||||||
|
RUN case "$(which python3)" in \
|
||||||
|
*"/opt/conda/envs/py_3.9"*) \
|
||||||
|
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
# make sure punica kernels are built (for LoRA)
|
# Package upgrades for useful functionality or to avoid dependency issues
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
pip install --upgrade numba scipy huggingface-hub[cli]
|
||||||
|
|
||||||
|
# Make sure punica kernels are built (for LoRA)
|
||||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||||
# Workaround for ray >= 2.10.0
|
# Workaround for ray >= 2.10.0
|
||||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
# Silences the HF Tokenizers warning
|
||||||
|
ENV TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -U -r requirements-rocm.txt \
|
pip install -U -r requirements-rocm.txt \
|
||||||
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
|
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
&& python3 setup.py install \
|
*"rocm-6.0"*) \
|
||||||
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
|
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
|
||||||
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
|
*"rocm-6.1"*) \
|
||||||
&& cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
|
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
|
||||||
&& cd ..
|
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
|
||||||
|
&& cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
|
||||||
|
# Prevent interference if torch bundles its own HIP runtime
|
||||||
|
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
|
||||||
|
*) ;; esac \
|
||||||
|
&& python3 setup.py clean --all \
|
||||||
|
&& python3 setup.py develop
|
||||||
|
|
||||||
|
# Copy amdsmi wheel into final image
|
||||||
|
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y amdsmi;
|
||||||
|
|
||||||
|
# Copy triton wheel(s) into final image if they were built
|
||||||
|
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& if ls /install/*.whl; then \
|
||||||
|
cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y triton; fi
|
||||||
|
|
||||||
|
# Copy flash-attn wheel(s) into final image if they were built
|
||||||
|
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& if ls /install/*.whl; then \
|
||||||
|
cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y flash-attn; fi
|
||||||
|
|
||||||
|
# Install wheels that were built to the final image
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
if ls libs/*.whl; then \
|
||||||
|
pip install libs/*.whl; fi
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
@@ -2,11 +2,8 @@ ARG NIGHTLY_DATE="20240601"
|
|||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
COPY . /workspace/vllm
|
|
||||||
|
|
||||||
ENV VLLM_TARGET_DEVICE="tpu"
|
|
||||||
# Install aiohttp separately to avoid build errors.
|
# Install aiohttp separately to avoid build errors.
|
||||||
RUN pip install aiohttp
|
RUN pip install aiohttp
|
||||||
# Install the TPU and Pallas dependencies.
|
# Install the TPU and Pallas dependencies.
|
||||||
@@ -14,6 +11,13 @@ RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases
|
|||||||
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
|
|
||||||
# Build vLLM.
|
# Build vLLM.
|
||||||
|
COPY . /workspace/vllm
|
||||||
|
ENV VLLM_TARGET_DEVICE="tpu"
|
||||||
RUN cd /workspace/vllm && python setup.py develop
|
RUN cd /workspace/vllm && python setup.py develop
|
||||||
|
|
||||||
|
# Re-install outlines to avoid dependency errors.
|
||||||
|
# The outlines version must follow requirements-common.txt.
|
||||||
|
RUN pip uninstall outlines -y
|
||||||
|
RUN pip install "outlines>=0.0.43"
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
22
Dockerfile.xpu
Normal file
22
Dockerfile.xpu
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
|
||||||
|
|
||||||
|
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||||
|
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||||
|
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||||
|
rm /etc/apt/sources.list.d/intel-graphics.list && \
|
||||||
|
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||||
|
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||||
|
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||||
|
|
||||||
|
RUN apt-get update -y \
|
||||||
|
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
|
||||||
|
|
||||||
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
RUN pip install -v -r requirements-xpu.txt
|
||||||
|
|
||||||
|
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
26
README.md
26
README.md
@@ -16,27 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**Ray Summit CPF is Open (June 4th to June 20th)!**
|
|
||||||
|
|
||||||
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
|
|
||||||
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
|
|
||||||
This will be a great chance for everyone in the community to get together and learn.
|
|
||||||
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||||
- [2024/01] Added ROCm 6.0 support to vLLM.
|
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||||
- [2023/12] Added ROCm 5.7 support to vLLM.
|
|
||||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
|
||||||
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
|
|
||||||
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
|
|
||||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||||
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
|
|
||||||
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
|
|
||||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -52,14 +37,16 @@ vLLM is fast with:
|
|||||||
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
|
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
|
||||||
- Optimized CUDA kernels
|
- Optimized CUDA kernels
|
||||||
|
|
||||||
|
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||||
|
|
||||||
vLLM is flexible and easy to use with:
|
vLLM is flexible and easy to use with:
|
||||||
|
|
||||||
- Seamless integration with popular Hugging Face models
|
- Seamless integration with popular Hugging Face models
|
||||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||||
- Tensor parallelism support for distributed inference
|
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||||
- Streaming outputs
|
- Streaming outputs
|
||||||
- OpenAI-compatible API server
|
- OpenAI-compatible API server
|
||||||
- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
|
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
|
||||||
- (Experimental) Prefix caching support
|
- (Experimental) Prefix caching support
|
||||||
- (Experimental) Multi-lora support
|
- (Experimental) Multi-lora support
|
||||||
|
|
||||||
@@ -112,6 +99,7 @@ vLLM is a community project. Our compute resources for development and testing a
|
|||||||
- Trainy
|
- Trainy
|
||||||
- UC Berkeley
|
- UC Berkeley
|
||||||
- UC San Diego
|
- UC San Diego
|
||||||
|
- ZhenFund
|
||||||
|
|
||||||
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||||
|
|
||||||
|
|||||||
@@ -4,10 +4,13 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
import huggingface_hub.constants
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
|
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||||
|
PreTrainedTokenizerFast)
|
||||||
|
|
||||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||||
|
|
||||||
@@ -222,8 +225,8 @@ async def async_request_openai_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"v1/completions"
|
"completions"
|
||||||
), "OpenAI Completions API URL must end with 'v1/completions'."
|
), "OpenAI Completions API URL must end with 'completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert not request_func_input.use_beam_search
|
assert not request_func_input.use_beam_search
|
||||||
@@ -262,6 +265,9 @@ async def async_request_openai_completions(
|
|||||||
else:
|
else:
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
if data["choices"][0]["text"]:
|
if data["choices"][0]["text"]:
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
@@ -270,12 +276,8 @@ async def async_request_openai_completions(
|
|||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
# NOTE: Some completion API might have a last
|
output.itl.append(timestamp -
|
||||||
# usage summary response without a token so we
|
most_recent_timestamp)
|
||||||
# do not want to include as inter-token-latency
|
|
||||||
elif data.get("usage", None) is None:
|
|
||||||
output.itl.append(timestamp -
|
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += data["choices"][0]["text"]
|
generated_text += data["choices"][0]["text"]
|
||||||
@@ -302,8 +304,8 @@ async def async_request_openai_chat_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"v1/chat/completions"
|
"chat/completions"
|
||||||
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert not request_func_input.use_beam_search
|
assert not request_func_input.use_beam_search
|
||||||
@@ -388,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||||
|
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||||
|
from modelscope import snapshot_download
|
||||||
|
|
||||||
|
model_path = snapshot_download(
|
||||||
|
model_id=pretrained_model_name_or_path,
|
||||||
|
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||||
|
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
||||||
|
|
||||||
|
return model_path
|
||||||
|
return pretrained_model_name_or_path
|
||||||
|
|
||||||
|
|
||||||
|
def get_tokenizer(
|
||||||
|
pretrained_model_name_or_path: str, trust_remote_code: bool
|
||||||
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
|
pretrained_model_name_or_path):
|
||||||
|
pretrained_model_name_or_path = get_model(
|
||||||
|
pretrained_model_name_or_path)
|
||||||
|
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
|
||||||
|
trust_remote_code=trust_remote_code)
|
||||||
|
|
||||||
|
|
||||||
ASYNC_REQUEST_FUNCS = {
|
ASYNC_REQUEST_FUNCS = {
|
||||||
"tgi": async_request_tgi,
|
"tgi": async_request_tgi,
|
||||||
"vllm": async_request_openai_completions,
|
"vllm": async_request_openai_completions,
|
||||||
@@ -396,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
|
|||||||
"openai": async_request_openai_completions,
|
"openai": async_request_openai_completions,
|
||||||
"openai-chat": async_request_openai_chat_completions,
|
"openai-chat": async_request_openai_chat_completions,
|
||||||
"tensorrt-llm": async_request_trt_llm,
|
"tensorrt-llm": async_request_trt_llm,
|
||||||
|
"scalellm": async_request_openai_completions,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,8 +10,10 @@ import torch
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptStrictInputs
|
from vllm.inputs import PromptStrictInputs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
@@ -19,25 +21,33 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||||
# the engine will automatically process the request in multiple batches.
|
# the engine will automatically process the request in multiple batches.
|
||||||
llm = LLM(model=args.model,
|
llm = LLM(
|
||||||
speculative_model=args.speculative_model,
|
model=args.model,
|
||||||
num_speculative_tokens=args.num_speculative_tokens,
|
speculative_model=args.speculative_model,
|
||||||
tokenizer=args.tokenizer,
|
num_speculative_tokens=args.num_speculative_tokens,
|
||||||
quantization=args.quantization,
|
speculative_draft_tensor_parallel_size=\
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
args.speculative_draft_tensor_parallel_size,
|
||||||
trust_remote_code=args.trust_remote_code,
|
tokenizer=args.tokenizer,
|
||||||
dtype=args.dtype,
|
quantization=args.quantization,
|
||||||
enforce_eager=args.enforce_eager,
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
kv_cache_dtype=args.kv_cache_dtype,
|
trust_remote_code=args.trust_remote_code,
|
||||||
quantization_param_path=args.quantization_param_path,
|
dtype=args.dtype,
|
||||||
device=args.device,
|
max_model_len=args.max_model_len,
|
||||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
enforce_eager=args.enforce_eager,
|
||||||
use_v2_block_manager=args.use_v2_block_manager,
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
quantization_param_path=args.quantization_param_path,
|
||||||
download_dir=args.download_dir,
|
device=args.device,
|
||||||
block_size=args.block_size,
|
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||||
gpu_memory_utilization=args.gpu_memory_utilization,
|
use_v2_block_manager=args.use_v2_block_manager,
|
||||||
distributed_executor_backend=args.distributed_executor_backend)
|
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||||
|
download_dir=args.download_dir,
|
||||||
|
block_size=args.block_size,
|
||||||
|
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||||
|
load_format=args.load_format,
|
||||||
|
distributed_executor_backend=args.distributed_executor_backend,
|
||||||
|
otlp_traces_endpoint=args.otlp_traces_endpoint,
|
||||||
|
enable_prefix_caching=args.enable_prefix_caching,
|
||||||
|
)
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
n=args.n,
|
n=args.n,
|
||||||
@@ -96,7 +106,7 @@ def main(args: argparse.Namespace):
|
|||||||
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
||||||
latencies.append(run_to_completion(profile_dir=None))
|
latencies.append(run_to_completion(profile_dir=None))
|
||||||
latencies = np.array(latencies)
|
latencies = np.array(latencies)
|
||||||
percentages = [10, 25, 50, 75, 90]
|
percentages = [10, 25, 50, 75, 90, 99]
|
||||||
percentiles = np.percentile(latencies, percentages)
|
percentiles = np.percentile(latencies, percentages)
|
||||||
print(f'Avg latency: {np.mean(latencies)} seconds')
|
print(f'Avg latency: {np.mean(latencies)} seconds')
|
||||||
for percentage, percentile in zip(percentages, percentiles):
|
for percentage, percentile in zip(percentages, percentiles):
|
||||||
@@ -114,12 +124,16 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the latency of processing a single batch of '
|
description='Benchmark the latency of processing a single batch of '
|
||||||
'requests till completion.')
|
'requests till completion.')
|
||||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
||||||
parser.add_argument('--speculative-model', type=str, default=None)
|
parser.add_argument('--speculative-model', type=str, default=None)
|
||||||
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
||||||
|
parser.add_argument('--speculative-draft-tensor-parallel-size',
|
||||||
|
'-spec-draft-tp',
|
||||||
|
type=int,
|
||||||
|
default=None)
|
||||||
parser.add_argument('--tokenizer', type=str, default=None)
|
parser.add_argument('--tokenizer', type=str, default=None)
|
||||||
parser.add_argument('--quantization',
|
parser.add_argument('--quantization',
|
||||||
'-q',
|
'-q',
|
||||||
@@ -145,6 +159,12 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('--trust-remote-code',
|
parser.add_argument('--trust-remote-code',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help='trust remote code from huggingface')
|
help='trust remote code from huggingface')
|
||||||
|
parser.add_argument(
|
||||||
|
'--max-model-len',
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help='Maximum length of a sequence (including prompt and output). '
|
||||||
|
'If None, will be derived from the model.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--dtype',
|
'--dtype',
|
||||||
type=str,
|
type=str,
|
||||||
@@ -188,9 +208,10 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--device",
|
"--device",
|
||||||
type=str,
|
type=str,
|
||||||
default="cuda",
|
default="auto",
|
||||||
choices=["cuda", "cpu", "tpu"],
|
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
||||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
||||||
|
'CPU.')
|
||||||
parser.add_argument('--block-size',
|
parser.add_argument('--block-size',
|
||||||
type=int,
|
type=int,
|
||||||
default=16,
|
default=16,
|
||||||
@@ -200,6 +221,9 @@ if __name__ == '__main__':
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help='If True, the prefill requests can be chunked based on the '
|
help='If True, the prefill requests can be chunked based on the '
|
||||||
'max_num_batched_tokens')
|
'max_num_batched_tokens')
|
||||||
|
parser.add_argument("--enable-prefix-caching",
|
||||||
|
action='store_true',
|
||||||
|
help="Enable automatic prefix caching")
|
||||||
parser.add_argument('--use-v2-block-manager', action='store_true')
|
parser.add_argument('--use-v2-block-manager', action='store_true')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--ray-workers-use-nsight",
|
"--ray-workers-use-nsight",
|
||||||
@@ -222,6 +246,29 @@ if __name__ == '__main__':
|
|||||||
help='the fraction of GPU memory to be used for '
|
help='the fraction of GPU memory to be used for '
|
||||||
'the model executor, which can range from 0 to 1.'
|
'the model executor, which can range from 0 to 1.'
|
||||||
'If unspecified, will use the default value of 0.9.')
|
'If unspecified, will use the default value of 0.9.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--load-format',
|
||||||
|
type=str,
|
||||||
|
default=EngineArgs.load_format,
|
||||||
|
choices=[
|
||||||
|
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||||
|
'bitsandbytes'
|
||||||
|
],
|
||||||
|
help='The format of the model weights to load.\n\n'
|
||||||
|
'* "auto" will try to load the weights in the safetensors format '
|
||||||
|
'and fall back to the pytorch bin format if safetensors format '
|
||||||
|
'is not available.\n'
|
||||||
|
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||||
|
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||||
|
'* "npcache" will load the weights in pytorch format and store '
|
||||||
|
'a numpy cache to speed up the loading.\n'
|
||||||
|
'* "dummy" will initialize the weights with random values, '
|
||||||
|
'which is mainly for profiling.\n'
|
||||||
|
'* "tensorizer" will load the weights using tensorizer from '
|
||||||
|
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||||
|
'section for more information.\n'
|
||||||
|
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||||
|
'quantization.\n')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--distributed-executor-backend',
|
'--distributed-executor-backend',
|
||||||
choices=['ray', 'mp'],
|
choices=['ray', 'mp'],
|
||||||
@@ -229,5 +276,10 @@ if __name__ == '__main__':
|
|||||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
'is used, will be automatically set to "ray" if installed '
|
'is used, will be automatically set to "ray" if installed '
|
||||||
'or "mp" (multiprocessing) otherwise.')
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--otlp-traces-endpoint',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help='Target URL to which OpenTelemetry traces will be sent.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import argparse
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
|
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
|
||||||
|
|
||||||
@@ -44,7 +44,7 @@ def main(args):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the performance with or without automatic '
|
description='Benchmark the performance with or without automatic '
|
||||||
'prefix caching.')
|
'prefix caching.')
|
||||||
parser.add_argument('--model',
|
parser.add_argument('--model',
|
||||||
|
|||||||
@@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
vLLM OpenAI API server
|
vLLM OpenAI API server
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
vllm serve <your_model> \
|
||||||
--model <your_model> --swap-space 16 \
|
--swap-space 16 \
|
||||||
--disable-log-requests
|
--disable-log-requests
|
||||||
|
|
||||||
(TGI backend)
|
(TGI backend)
|
||||||
@@ -17,7 +17,7 @@ On the client side, run:
|
|||||||
--dataset-path <path to dataset> \
|
--dataset-path <path to dataset> \
|
||||||
--request-rate <request_rate> \ # By default <request_rate> is inf
|
--request-rate <request_rate> \ # By default <request_rate> is inf
|
||||||
--num-prompts <num_prompts> # By default <num_prompts> is 1000
|
--num-prompts <num_prompts> # By default <num_prompts> is 1000
|
||||||
|
|
||||||
when using tgi backend, add
|
when using tgi backend, add
|
||||||
--endpoint /generate_stream
|
--endpoint /generate_stream
|
||||||
to the end of the command above.
|
to the end of the command above.
|
||||||
@@ -31,7 +31,7 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import AsyncGenerator, List, Optional, Tuple
|
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||||
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
|||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
try:
|
||||||
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
except ImportError:
|
||||||
|
from backend_request_func import get_tokenizer
|
||||||
|
|
||||||
|
try:
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
except ImportError:
|
||||||
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -52,12 +60,15 @@ class BenchmarkMetrics:
|
|||||||
output_throughput: float
|
output_throughput: float
|
||||||
mean_ttft_ms: float
|
mean_ttft_ms: float
|
||||||
median_ttft_ms: float
|
median_ttft_ms: float
|
||||||
|
std_ttft_ms: float
|
||||||
p99_ttft_ms: float
|
p99_ttft_ms: float
|
||||||
mean_tpot_ms: float
|
mean_tpot_ms: float
|
||||||
median_tpot_ms: float
|
median_tpot_ms: float
|
||||||
|
std_tpot_ms: float
|
||||||
p99_tpot_ms: float
|
p99_tpot_ms: float
|
||||||
mean_itl_ms: float
|
mean_itl_ms: float
|
||||||
median_itl_ms: float
|
median_itl_ms: float
|
||||||
|
std_itl_ms: float
|
||||||
p99_itl_ms: float
|
p99_itl_ms: float
|
||||||
|
|
||||||
|
|
||||||
@@ -69,7 +80,6 @@ def sample_sharegpt_requests(
|
|||||||
) -> List[Tuple[str, int, int]]:
|
) -> List[Tuple[str, int, int]]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
|
|
||||||
# Load the dataset.
|
# Load the dataset.
|
||||||
with open(dataset_path) as f:
|
with open(dataset_path) as f:
|
||||||
dataset = json.load(f)
|
dataset = json.load(f)
|
||||||
@@ -177,6 +187,31 @@ def sample_sonnet_requests(
|
|||||||
return sampled_requests
|
return sampled_requests
|
||||||
|
|
||||||
|
|
||||||
|
def sample_random_requests(
|
||||||
|
input_len: int, output_len: int, num_prompts: int, range_ratio: float,
|
||||||
|
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
|
||||||
|
|
||||||
|
input_lens = np.random.randint(
|
||||||
|
int(input_len * range_ratio),
|
||||||
|
input_len + 1,
|
||||||
|
size=num_prompts,
|
||||||
|
)
|
||||||
|
output_lens = np.random.randint(
|
||||||
|
int(output_len * range_ratio),
|
||||||
|
output_len + 1,
|
||||||
|
size=num_prompts,
|
||||||
|
)
|
||||||
|
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
||||||
|
input_requests = []
|
||||||
|
for i in range(num_prompts):
|
||||||
|
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
|
||||||
|
for j in range(input_lens[i])])
|
||||||
|
input_requests.append(
|
||||||
|
(prompt, int(input_lens[i]), int(output_lens[i])))
|
||||||
|
|
||||||
|
return input_requests
|
||||||
|
|
||||||
|
|
||||||
async def get_request(
|
async def get_request(
|
||||||
input_requests: List[Tuple[str, int, int]],
|
input_requests: List[Tuple[str, int, int]],
|
||||||
request_rate: float,
|
request_rate: float,
|
||||||
@@ -188,6 +223,7 @@ async def get_request(
|
|||||||
if request_rate == float("inf"):
|
if request_rate == float("inf"):
|
||||||
# If the request rate is infinity, then we don't need to wait.
|
# If the request rate is infinity, then we don't need to wait.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Sample the request interval from the exponential distribution.
|
# Sample the request interval from the exponential distribution.
|
||||||
interval = np.random.exponential(1.0 / request_rate)
|
interval = np.random.exponential(1.0 / request_rate)
|
||||||
# The next request will be sent after the interval.
|
# The next request will be sent after the interval.
|
||||||
@@ -200,18 +236,18 @@ def calculate_metrics(
|
|||||||
dur_s: float,
|
dur_s: float,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||||
actual_output_lens = []
|
actual_output_lens: List[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
completed = 0
|
completed = 0
|
||||||
itls = []
|
itls: List[float] = []
|
||||||
tpots = []
|
tpots: List[float] = []
|
||||||
ttfts = []
|
ttfts: List[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
if outputs[i].success:
|
if outputs[i].success:
|
||||||
# We use the tokenizer to count the number of output tokens for all
|
# We use the tokenizer to count the number of output tokens for all
|
||||||
# serving backends instead of looking at len(outputs[i].itl) since
|
# serving backends instead of looking at len(outputs[i].itl) since
|
||||||
# multiple output tokens may be bundled together
|
# multiple output tokens may be bundled together
|
||||||
# Note: this may inflate the output token count slightly
|
# Note : this may inflate the output token count slightly
|
||||||
output_len = len(
|
output_len = len(
|
||||||
tokenizer(outputs[i].generated_text,
|
tokenizer(outputs[i].generated_text,
|
||||||
add_special_tokens=False).input_ids)
|
add_special_tokens=False).input_ids)
|
||||||
@@ -241,12 +277,15 @@ def calculate_metrics(
|
|||||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||||
1000, # ttfts is empty if streaming is not supported by backend
|
1000, # ttfts is empty if streaming is not supported by backend
|
||||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||||
|
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||||
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
|
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
|
||||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||||
|
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||||
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
|
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
|
||||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||||
median_itl_ms=np.median(itls or 0) * 1000,
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
|
std_itl_ms=np.std(itls or 0) * 1000,
|
||||||
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -265,7 +304,7 @@ async def benchmark(
|
|||||||
disable_tqdm: bool,
|
disable_tqdm: bool,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS.get(backend)
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {backend}")
|
raise ValueError(f"Unknown backend: {backend}")
|
||||||
|
|
||||||
@@ -292,7 +331,7 @@ async def benchmark(
|
|||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_start_time = time.perf_counter()
|
||||||
tasks = []
|
tasks: List[asyncio.Task] = []
|
||||||
async for request in get_request(input_requests, request_rate):
|
async for request in get_request(input_requests, request_rate):
|
||||||
prompt, prompt_len, output_len = request
|
prompt, prompt_len, output_len = request
|
||||||
request_func_input = RequestFuncInput(
|
request_func_input = RequestFuncInput(
|
||||||
@@ -310,7 +349,7 @@ async def benchmark(
|
|||||||
pbar=pbar)))
|
pbar=pbar)))
|
||||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if not disable_tqdm:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||||
@@ -363,12 +402,15 @@ async def benchmark(
|
|||||||
"output_throughput": metrics.output_throughput,
|
"output_throughput": metrics.output_throughput,
|
||||||
"mean_ttft_ms": metrics.mean_ttft_ms,
|
"mean_ttft_ms": metrics.mean_ttft_ms,
|
||||||
"median_ttft_ms": metrics.median_ttft_ms,
|
"median_ttft_ms": metrics.median_ttft_ms,
|
||||||
|
"std_ttft_ms": metrics.std_ttft_ms,
|
||||||
"p99_ttft_ms": metrics.p99_ttft_ms,
|
"p99_ttft_ms": metrics.p99_ttft_ms,
|
||||||
"mean_tpot_ms": metrics.mean_tpot_ms,
|
"mean_tpot_ms": metrics.mean_tpot_ms,
|
||||||
"median_tpot_ms": metrics.median_tpot_ms,
|
"median_tpot_ms": metrics.median_tpot_ms,
|
||||||
|
"std_tpot_ms": metrics.std_tpot_ms,
|
||||||
"p99_tpot_ms": metrics.p99_tpot_ms,
|
"p99_tpot_ms": metrics.p99_tpot_ms,
|
||||||
"mean_itl_ms": metrics.mean_itl_ms,
|
"mean_itl_ms": metrics.mean_itl_ms,
|
||||||
"median_itl_ms": metrics.median_itl_ms,
|
"median_itl_ms": metrics.median_itl_ms,
|
||||||
|
"std_itl_ms": metrics.std_itl_ms,
|
||||||
"p99_itl_ms": metrics.p99_itl_ms,
|
"p99_itl_ms": metrics.p99_itl_ms,
|
||||||
"input_lens": [output.prompt_len for output in outputs],
|
"input_lens": [output.prompt_len for output in outputs],
|
||||||
"output_lens": actual_output_lens,
|
"output_lens": actual_output_lens,
|
||||||
@@ -448,6 +490,15 @@ def main(args: argparse.Namespace):
|
|||||||
for prompt, prompt_formatted, prompt_len,
|
for prompt, prompt_formatted, prompt_len,
|
||||||
output_len in input_requests]
|
output_len in input_requests]
|
||||||
|
|
||||||
|
elif args.dataset_name == "random":
|
||||||
|
input_requests = sample_random_requests(
|
||||||
|
input_len=args.random_input_len,
|
||||||
|
output_len=args.random_output_len,
|
||||||
|
num_prompts=args.num_prompts,
|
||||||
|
range_ratio=args.random_range_ratio,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||||
|
|
||||||
@@ -466,7 +517,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
if args.save_result:
|
if args.save_result:
|
||||||
result_json = {}
|
result_json: Dict[str, Any] = {}
|
||||||
|
|
||||||
# Setup
|
# Setup
|
||||||
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
@@ -499,6 +550,8 @@ def main(args: argparse.Namespace):
|
|||||||
# Save to file
|
# Save to file
|
||||||
base_model_id = model_id.split("/")[-1]
|
base_model_id = model_id.split("/")[-1]
|
||||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||||
|
if args.result_filename:
|
||||||
|
file_name = args.result_filename
|
||||||
if args.result_dir:
|
if args.result_dir:
|
||||||
file_name = os.path.join(args.result_dir, file_name)
|
file_name = os.path.join(args.result_dir, file_name)
|
||||||
with open(file_name, "w") as outfile:
|
with open(file_name, "w") as outfile:
|
||||||
@@ -506,7 +559,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the online serving throughput.")
|
description="Benchmark the online serving throughput.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--backend",
|
"--backend",
|
||||||
@@ -539,7 +592,7 @@ if __name__ == "__main__":
|
|||||||
"--dataset-name",
|
"--dataset-name",
|
||||||
type=str,
|
type=str,
|
||||||
default="sharegpt",
|
default="sharegpt",
|
||||||
choices=["sharegpt", "sonnet"],
|
choices=["sharegpt", "sonnet", "random"],
|
||||||
help="Name of the dataset to benchmark on.",
|
help="Name of the dataset to benchmark on.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--dataset-path",
|
parser.add_argument("--dataset-path",
|
||||||
@@ -556,7 +609,7 @@ if __name__ == "__main__":
|
|||||||
"--tokenizer",
|
"--tokenizer",
|
||||||
type=str,
|
type=str,
|
||||||
help=
|
help=
|
||||||
"Name or path of the tokenizer, if not using the default tokenizer.",
|
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--best-of",
|
"--best-of",
|
||||||
@@ -599,6 +652,27 @@ if __name__ == "__main__":
|
|||||||
help=
|
help=
|
||||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--random-input-len",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help=
|
||||||
|
"Number of input tokens per request, used only for random sampling.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--random-output-len",
|
||||||
|
type=int,
|
||||||
|
default=128,
|
||||||
|
help=
|
||||||
|
"Number of output tokens per request, used only for random sampling.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--random-range-ratio",
|
||||||
|
type=float,
|
||||||
|
default=1.0,
|
||||||
|
help="Range of sampled ratio of input/output length, "
|
||||||
|
"used only for random sampling.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--request-rate",
|
"--request-rate",
|
||||||
type=float,
|
type=float,
|
||||||
@@ -639,6 +713,15 @@ if __name__ == "__main__":
|
|||||||
help="Specify directory to save benchmark json results."
|
help="Specify directory to save benchmark json results."
|
||||||
"If not specified, results are saved in the current directory.",
|
"If not specified, results are saved in the current directory.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--result-filename",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Specify the filename to save benchmark json results."
|
||||||
|
"If not specified, results will be saved in "
|
||||||
|
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||||
|
" format.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ from tqdm import tqdm
|
|||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
PreTrainedTokenizerBase)
|
PreTrainedTokenizerBase)
|
||||||
|
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
def sample_requests(
|
||||||
@@ -81,6 +83,7 @@ def run_vllm(
|
|||||||
distributed_executor_backend: Optional[str],
|
distributed_executor_backend: Optional[str],
|
||||||
gpu_memory_utilization: float = 0.9,
|
gpu_memory_utilization: float = 0.9,
|
||||||
download_dir: Optional[str] = None,
|
download_dir: Optional[str] = None,
|
||||||
|
load_format: str = EngineArgs.load_format,
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@@ -102,11 +105,12 @@ def run_vllm(
|
|||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
max_num_batched_tokens=max_num_batched_tokens,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
load_format=load_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts = []
|
prompts: List[str] = []
|
||||||
sampling_params = []
|
sampling_params: List[SamplingParams] = []
|
||||||
for prompt, _, output_len in requests:
|
for prompt, _, output_len in requests:
|
||||||
prompts.append(prompt)
|
prompts.append(prompt)
|
||||||
sampling_params.append(
|
sampling_params.append(
|
||||||
@@ -228,7 +232,7 @@ def main(args: argparse.Namespace):
|
|||||||
args.quantization_param_path, args.device,
|
args.quantization_param_path, args.device,
|
||||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||||
args.max_num_batched_tokens, args.distributed_executor_backend,
|
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||||
args.gpu_memory_utilization, args.download_dir)
|
args.gpu_memory_utilization, args.download_dir, args.load_format)
|
||||||
elif args.backend == "hf":
|
elif args.backend == "hf":
|
||||||
assert args.tensor_parallel_size == 1
|
assert args.tensor_parallel_size == 1
|
||||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||||
@@ -258,7 +262,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
|
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||||
parser.add_argument("--backend",
|
parser.add_argument("--backend",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["vllm", "hf", "mii"],
|
choices=["vllm", "hf", "mii"],
|
||||||
@@ -345,9 +349,10 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--device",
|
"--device",
|
||||||
type=str,
|
type=str,
|
||||||
default="cuda",
|
default="auto",
|
||||||
choices=["cuda", "cpu", "tpu"],
|
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
||||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
||||||
|
'CPU.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--enable-prefix-caching",
|
"--enable-prefix-caching",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@@ -377,6 +382,29 @@ if __name__ == "__main__":
|
|||||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
'is used, will be automatically set to "ray" if installed '
|
'is used, will be automatically set to "ray" if installed '
|
||||||
'or "mp" (multiprocessing) otherwise.')
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--load-format',
|
||||||
|
type=str,
|
||||||
|
default=EngineArgs.load_format,
|
||||||
|
choices=[
|
||||||
|
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||||
|
'bitsandbytes'
|
||||||
|
],
|
||||||
|
help='The format of the model weights to load.\n\n'
|
||||||
|
'* "auto" will try to load the weights in the safetensors format '
|
||||||
|
'and fall back to the pytorch bin format if safetensors format '
|
||||||
|
'is not available.\n'
|
||||||
|
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||||
|
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||||
|
'* "npcache" will load the weights in pytorch format and store '
|
||||||
|
'a numpy cache to speed up the loading.\n'
|
||||||
|
'* "dummy" will initialize the weights with random values, '
|
||||||
|
'which is mainly for profiling.\n'
|
||||||
|
'* "tensorizer" will load the weights using tensorizer from '
|
||||||
|
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||||
|
'section for more information.\n'
|
||||||
|
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||||
|
'quantization.\n')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
|
|||||||
from weight_shapes import WEIGHT_SHAPES
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
|
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
|
||||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
@@ -46,7 +47,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
|
|||||||
# impl
|
# impl
|
||||||
|
|
||||||
|
|
||||||
def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||||
scale_b: torch.tensor,
|
scale_b: torch.tensor,
|
||||||
out_dtype: torch.dtype) -> torch.tensor:
|
out_dtype: torch.dtype) -> torch.tensor:
|
||||||
return torch.mm(a, b)
|
return torch.mm(a, b)
|
||||||
@@ -115,14 +116,13 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
|||||||
timers.append(
|
timers.append(
|
||||||
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||||
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||||
torch.bfloat16, label, sub_label, pytorch_i8_impl,
|
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||||
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||||
|
|
||||||
# cutlass impl
|
# cutlass impl
|
||||||
timers.append(
|
timers.append(
|
||||||
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
torch.bfloat16, label, sub_label, cutlass_impl,
|
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
|
||||||
"cutlass_i8_i8_bf16_scaled_mm"))
|
|
||||||
|
|
||||||
return timers
|
return timers
|
||||||
|
|
||||||
@@ -136,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
|||||||
|
|
||||||
timers = []
|
timers = []
|
||||||
|
|
||||||
|
# pytorch impl w. bf16
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||||
|
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||||
|
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||||
|
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||||
|
|
||||||
# pytorch impl: bf16 output, without fp8 fast accum
|
# pytorch impl: bf16 output, without fp8 fast accum
|
||||||
timers.append(
|
timers.append(
|
||||||
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
@@ -160,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
|||||||
|
|
||||||
# cutlass impl: bf16 output
|
# cutlass impl: bf16 output
|
||||||
timers.append(
|
timers.append(
|
||||||
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
torch.bfloat16, label, sub_label, cutlass_impl,
|
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
|
||||||
"cutlass_fp8_fp8_bf16_scaled_mm"))
|
|
||||||
# cutlass impl: fp16 output
|
# cutlass impl: fp16 output
|
||||||
timers.append(
|
timers.append(
|
||||||
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
|
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||||
torch.float16, label, sub_label, cutlass_impl,
|
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
|
||||||
"cutlass_fp8_fp8_fp16_scaled_mm"))
|
|
||||||
return timers
|
return timers
|
||||||
|
|
||||||
|
|
||||||
@@ -289,7 +294,7 @@ if __name__ == '__main__':
|
|||||||
return torch.float8_e4m3fn
|
return torch.float8_e4m3fn
|
||||||
raise ValueError("unsupported dtype")
|
raise ValueError("unsupported dtype")
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="""
|
description="""
|
||||||
Benchmark Cutlass GEMM.
|
Benchmark Cutlass GEMM.
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,12 @@ WEIGHT_SHAPES = {
|
|||||||
([4096, 22016], 1),
|
([4096, 22016], 1),
|
||||||
([11008, 4096], 0),
|
([11008, 4096], 0),
|
||||||
],
|
],
|
||||||
|
"meta-llama/Llama-3-8b": [
|
||||||
|
([4096, 6144], 1),
|
||||||
|
([4096, 4096], 0),
|
||||||
|
([4096, 28672], 1),
|
||||||
|
([14336, 4096], 0),
|
||||||
|
],
|
||||||
"meta-llama/Llama-2-13b-hf": [
|
"meta-llama/Llama-2-13b-hf": [
|
||||||
([5120, 15360], 1),
|
([5120, 15360], 1),
|
||||||
([5120, 5120], 0),
|
([5120, 5120], 0),
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import argparse
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops
|
|||||||
from vllm.model_executor.layers.quantization.aqlm import (
|
from vllm.model_executor.layers.quantization.aqlm import (
|
||||||
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
||||||
optimized_dequantize_gemm)
|
optimized_dequantize_gemm)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||||
|
|
||||||
@@ -86,9 +86,9 @@ def dequant_no_scale(
|
|||||||
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
||||||
# the generic pytorch version.
|
# the generic pytorch version.
|
||||||
# Just visual comparison.
|
# Just visual comparison.
|
||||||
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
|
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
|
||||||
|
|
||||||
n = parts.sum().item()
|
n = int(parts.sum().item())
|
||||||
|
|
||||||
device = torch.device('cuda:0')
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
|
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
|
||||||
|
|
||||||
# Add arguments
|
# Add arguments
|
||||||
parser.add_argument("--nbooks",
|
parser.add_argument("--nbooks",
|
||||||
@@ -204,7 +204,7 @@ def main():
|
|||||||
sys.stdout = sys.__stdout__
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
|
|
||||||
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
|
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
|
||||||
methods):
|
methods):
|
||||||
|
|
||||||
# I didn't see visible improvements from increasing these, but feel free :)
|
# I didn't see visible improvements from increasing these, but feel free :)
|
||||||
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
|
|||||||
print('')
|
print('')
|
||||||
|
|
||||||
|
|
||||||
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
|
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
|
||||||
nbooks: int, bits: int, method) -> float:
|
nbooks: int, bits: int, method) -> float:
|
||||||
|
|
||||||
n = parts.sum().item()
|
n = int(parts.sum().item())
|
||||||
|
|
||||||
device = torch.device('cuda:0')
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
|
|||||||
@@ -1,20 +1,23 @@
|
|||||||
import argparse
|
from typing import List
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
from benchmark_shapes import WEIGHT_SHAPES
|
from benchmark_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
|
||||||
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
|
||||||
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
|
|
||||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||||
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
|
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
|
||||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||||
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
|
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
|
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||||
|
MarlinWorkspace, marlin_quantize)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||||
|
marlin_24_quantize)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
gptq_pack, quantize_weights, sort_weights)
|
gptq_pack, quantize_weights, sort_weights)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
@@ -23,8 +26,9 @@ ACT_ORDER_OPTS = [False, True]
|
|||||||
K_FULL_OPTS = [False, True]
|
K_FULL_OPTS = [False, True]
|
||||||
|
|
||||||
|
|
||||||
def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
|
def bench_run(results: List[benchmark.Measurement], model: str,
|
||||||
size_m, size_k, size_n):
|
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
|
||||||
|
size_m: int, size_k: int, size_n: int):
|
||||||
label = "Quant Matmul"
|
label = "Quant Matmul"
|
||||||
|
|
||||||
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
||||||
@@ -156,7 +160,7 @@ def main(args):
|
|||||||
for i, model in enumerate(args.models):
|
for i, model in enumerate(args.models):
|
||||||
print(f"[{i}] {model}")
|
print(f"[{i}] {model}")
|
||||||
|
|
||||||
results = []
|
results: List[benchmark.Measurement] = []
|
||||||
|
|
||||||
for model in args.models:
|
for model in args.models:
|
||||||
for layer in WEIGHT_SHAPES[model]:
|
for layer in WEIGHT_SHAPES[model]:
|
||||||
@@ -209,7 +213,7 @@ def main(args):
|
|||||||
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
||||||
#
|
#
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark Marlin across specified models/shapes/batches")
|
description="Benchmark Marlin across specified models/shapes/batches")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--models",
|
"--models",
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Dict, List, Tuple
|
from typing import Any, Dict, List, Tuple, TypedDict
|
||||||
|
|
||||||
import ray
|
import ray
|
||||||
import torch
|
import torch
|
||||||
@@ -10,10 +10,20 @@ from ray.experimental.tqdm_ray import tqdm
|
|||||||
from transformers import AutoConfig
|
from transformers import AutoConfig
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkConfig(TypedDict):
|
||||||
|
BLOCK_SIZE_M: int
|
||||||
|
BLOCK_SIZE_N: int
|
||||||
|
BLOCK_SIZE_K: int
|
||||||
|
GROUP_SIZE_M: int
|
||||||
|
num_warps: int
|
||||||
|
num_stages: int
|
||||||
|
|
||||||
|
|
||||||
def benchmark_config(
|
def benchmark_config(
|
||||||
config: Dict[str, int],
|
config: BenchmarkConfig,
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
shard_intermediate_size: int,
|
shard_intermediate_size: int,
|
||||||
@@ -92,7 +102,7 @@ def benchmark_config(
|
|||||||
start_event = torch.cuda.Event(enable_timing=True)
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies = []
|
latencies: List[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
|
|||||||
# Reduced search space for faster tuning.
|
# Reduced search space for faster tuning.
|
||||||
# TODO(woosuk): Increase the search space and use a performance model to
|
# TODO(woosuk): Increase the search space and use a performance model to
|
||||||
# prune the search space.
|
# prune the search space.
|
||||||
configs = []
|
configs: List[BenchmarkConfig] = []
|
||||||
for num_stages in [2, 3, 4, 5]:
|
for num_stages in [2, 3, 4, 5]:
|
||||||
for block_m in [16, 32, 64, 128, 256]:
|
for block_m in [16, 32, 64, 128, 256]:
|
||||||
for block_k in [64, 128, 256]:
|
for block_k in [64, 128, 256]:
|
||||||
@@ -175,8 +185,8 @@ class BenchmarkWorker:
|
|||||||
topk: int,
|
topk: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
use_fp8: bool,
|
use_fp8: bool,
|
||||||
search_space: List[Dict[str, int]],
|
search_space: List[BenchmarkConfig],
|
||||||
) -> Dict[str, int]:
|
) -> BenchmarkConfig:
|
||||||
best_config = None
|
best_config = None
|
||||||
best_time = float("inf")
|
best_time = float("inf")
|
||||||
for config in tqdm(search_space):
|
for config in tqdm(search_space):
|
||||||
@@ -199,10 +209,11 @@ class BenchmarkWorker:
|
|||||||
best_config = config
|
best_config = config
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||||
|
assert best_config is not None
|
||||||
return best_config
|
return best_config
|
||||||
|
|
||||||
|
|
||||||
def sort_config(config: Dict[str, int]) -> Dict[str, int]:
|
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
|
||||||
return {
|
return {
|
||||||
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
||||||
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
||||||
@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
|
|||||||
|
|
||||||
|
|
||||||
def save_configs(
|
def save_configs(
|
||||||
configs: Dict[int, Dict[str, int]],
|
configs: Dict[int, BenchmarkConfig],
|
||||||
num_experts: int,
|
num_experts: int,
|
||||||
shard_intermediate_size: int,
|
shard_intermediate_size: int,
|
||||||
hidden_size: int,
|
hidden_size: int,
|
||||||
@@ -305,7 +316,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = FlexibleArgumentParser()
|
||||||
parser.add_argument("--model",
|
parser.add_argument("--model",
|
||||||
type=str,
|
type=str,
|
||||||
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
import argparse
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||||
|
create_kv_caches_with_random)
|
||||||
|
|
||||||
NUM_BLOCKS = 1024
|
NUM_BLOCKS = 1024
|
||||||
PARTITION_SIZE = 512
|
PARTITION_SIZE = 512
|
||||||
@@ -54,14 +54,17 @@ def main(
|
|||||||
|
|
||||||
# Create the block tables.
|
# Create the block tables.
|
||||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||||
block_tables = []
|
block_tables_lst: List[List[int]] = []
|
||||||
for _ in range(num_seqs):
|
for _ in range(num_seqs):
|
||||||
block_table = [
|
block_table = [
|
||||||
random.randint(0, NUM_BLOCKS - 1)
|
random.randint(0, NUM_BLOCKS - 1)
|
||||||
for _ in range(max_num_blocks_per_seq)
|
for _ in range(max_num_blocks_per_seq)
|
||||||
]
|
]
|
||||||
block_tables.append(block_table)
|
block_tables_lst.append(block_table)
|
||||||
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
|
|
||||||
|
block_tables = torch.tensor(block_tables_lst,
|
||||||
|
dtype=torch.int,
|
||||||
|
device=device)
|
||||||
|
|
||||||
# Create the KV cache.
|
# Create the KV cache.
|
||||||
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
||||||
@@ -158,14 +161,14 @@ def main(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the paged attention kernel.")
|
description="Benchmark the paged attention kernel.")
|
||||||
parser.add_argument("--version",
|
parser.add_argument("--version",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["v1", "v2"],
|
choices=["v1", "v2"],
|
||||||
default="v2")
|
default="v2")
|
||||||
parser.add_argument("--batch-size", type=int, default=8)
|
parser.add_argument("--batch-size", type=int, default=8)
|
||||||
parser.add_argument("--seq_len", type=int, default=4096)
|
parser.add_argument("--seq-len", type=int, default=4096)
|
||||||
parser.add_argument("--num-query-heads", type=int, default=64)
|
parser.add_argument("--num-query-heads", type=int, default=64)
|
||||||
parser.add_argument("--num-kv-heads", type=int, default=8)
|
parser.add_argument("--num-kv-heads", type=int, default=8)
|
||||||
parser.add_argument("--head-size",
|
parser.add_argument("--head-size",
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
import argparse
|
|
||||||
from itertools import accumulate
|
from itertools import accumulate
|
||||||
from typing import Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import nvtx
|
import nvtx
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
|
||||||
|
get_rope)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def benchmark_rope_kernels_multi_lora(
|
def benchmark_rope_kernels_multi_lora(
|
||||||
@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
})
|
})
|
||||||
# non-batched RoPE takes only one scaling factor, we create multiple
|
# non-batched RoPE takes only one scaling factor, we create multiple
|
||||||
# instances to simulate the same behavior
|
# instances to simulate the same behavior
|
||||||
non_batched_ropes = []
|
non_batched_ropes: List[RotaryEmbedding] = []
|
||||||
for scaling_factor in scaling_factors:
|
for scaling_factor in scaling_factors:
|
||||||
non_batched_ropes.append(
|
non_batched_ropes.append(
|
||||||
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
||||||
@@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the rotary embedding kernels.")
|
description="Benchmark the rotary embedding kernels.")
|
||||||
parser.add_argument("--is-neox-style", type=bool, default=True)
|
parser.add_argument("--is-neox-style", type=bool, default=True)
|
||||||
parser.add_argument("--batch-size", type=int, default=16)
|
parser.add_argument("--batch-size", type=int, default=16)
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
import argparse
|
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# A very long prompt, total number of tokens is about 15k.
|
# A very long prompt, total number of tokens is about 15k.
|
||||||
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
|
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
|
||||||
@@ -47,7 +47,7 @@ def main(args):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the performance of hashing function in'
|
description='Benchmark the performance of hashing function in'
|
||||||
'automatic prefix caching.')
|
'automatic prefix caching.')
|
||||||
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
|
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
|
||||||
|
|||||||
@@ -33,9 +33,23 @@ function (find_isa CPUINFO TARGET OUT)
|
|||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
function (is_avx512_disabled OUT)
|
||||||
|
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
||||||
|
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
||||||
|
set(${OUT} ON PARENT_SCOPE)
|
||||||
|
else()
|
||||||
|
set(${OUT} OFF PARENT_SCOPE)
|
||||||
|
endif()
|
||||||
|
endfunction()
|
||||||
|
|
||||||
if (AVX512_FOUND)
|
is_avx512_disabled(AVX512_DISABLED)
|
||||||
|
|
||||||
|
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||||
|
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||||
|
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||||
|
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||||
|
|
||||||
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-mavx512f"
|
"-mavx512f"
|
||||||
"-mavx512vl"
|
"-mavx512vl"
|
||||||
@@ -53,8 +67,18 @@ if (AVX512_FOUND)
|
|||||||
else()
|
else()
|
||||||
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
||||||
endif()
|
endif()
|
||||||
|
elseif (AVX2_FOUND)
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||||
|
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||||
|
elseif (POWER9_FOUND OR POWER10_FOUND)
|
||||||
|
message(STATUS "PowerPC detected")
|
||||||
|
# Check for PowerPC VSX support
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
|
"-mvsx"
|
||||||
|
"-mcpu=native"
|
||||||
|
"-mtune=native")
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
|
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||||
|
|||||||
@@ -147,16 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
|||||||
if (${GPU_LANG} STREQUAL "HIP")
|
if (${GPU_LANG} STREQUAL "HIP")
|
||||||
#
|
#
|
||||||
# `GPU_ARCHES` controls the `--offload-arch` flags.
|
# `GPU_ARCHES` controls the `--offload-arch` flags.
|
||||||
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
|
|
||||||
# via the `PYTORCH_ROCM_ARCH` env variable.
|
|
||||||
#
|
#
|
||||||
|
# If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
|
||||||
|
# if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
|
||||||
|
# "rocm_agent_enumerator" in "enable_language(HIP)"
|
||||||
|
# (in file Modules/CMakeDetermineHIPCompiler.cmake)
|
||||||
|
#
|
||||||
|
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
|
||||||
|
set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
|
||||||
|
else()
|
||||||
|
set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
|
||||||
|
endif()
|
||||||
#
|
#
|
||||||
# Find the intersection of the supported + detected architectures to
|
# Find the intersection of the supported + detected architectures to
|
||||||
# set the module architecture flags.
|
# set the module architecture flags.
|
||||||
#
|
#
|
||||||
set(${GPU_ARCHES})
|
set(${GPU_ARCHES})
|
||||||
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
|
foreach (_ARCH ${HIP_ARCHITECTURES})
|
||||||
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
|
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
|
||||||
list(APPEND ${GPU_ARCHES} ${_ARCH})
|
list(APPEND ${GPU_ARCHES} ${_ARCH})
|
||||||
endif()
|
endif()
|
||||||
@@ -164,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
|||||||
|
|
||||||
if(NOT ${GPU_ARCHES})
|
if(NOT ${GPU_ARCHES})
|
||||||
message(FATAL_ERROR
|
message(FATAL_ERROR
|
||||||
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
|
"None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
|
||||||
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
|
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
|||||||
return ((T)0.5) * x * (((T)1.0) + t);
|
return ((T)0.5) * x * (((T)1.0) + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
|
||||||
|
// x * sigmoid(1.702 * x)
|
||||||
|
return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void gelu_new(torch::Tensor& out, // [..., d]
|
void gelu_new(torch::Tensor& out, // [..., d]
|
||||||
@@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d]
|
|||||||
{
|
{
|
||||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, // [..., d]
|
||||||
|
torch::Tensor& input) // [..., d]
|
||||||
|
{
|
||||||
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
|
||||||
|
}
|
||||||
|
|||||||
@@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
|
|||||||
return w3 * x * (ones + t);
|
return w3 * x * (ones + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) {
|
||||||
|
const vec_op::FP32Vec8 zeros(0.0);
|
||||||
|
const vec_op::FP32Vec8 ones(1.0);
|
||||||
|
const vec_op::FP32Vec8 w1(1.702f);
|
||||||
|
return x / (ones + (zeros - w1 * x).exp());
|
||||||
|
}
|
||||||
|
|
||||||
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
|
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
|
||||||
const vec_op::FP32Vec8 ones(1.0);
|
const vec_op::FP32Vec8 ones(1.0);
|
||||||
const vec_op::FP32Vec8 w1(M_SQRT1_2);
|
const vec_op::FP32Vec8 w1(M_SQRT1_2);
|
||||||
@@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
|
|||||||
CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
|
CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
|
||||||
|
int num_tokens = input.numel() / input.size(-1);
|
||||||
|
int d = input.size(-1);
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(gelu_quick_impl)
|
||||||
|
activation_kernel<scalar_t, gelu_quick_act, false>(
|
||||||
|
num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
|
||||||
|
CPU_KERNEL_GUARD_OUT(gelu_quick_impl)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,351 +2,14 @@
|
|||||||
#ifndef CPU_TYPES_HPP
|
#ifndef CPU_TYPES_HPP
|
||||||
#define CPU_TYPES_HPP
|
#define CPU_TYPES_HPP
|
||||||
|
|
||||||
#include <immintrin.h>
|
#if defined(__x86_64__)
|
||||||
#include <torch/all.h>
|
//x86 implementation
|
||||||
|
#include "cpu_types_x86.hpp"
|
||||||
namespace vec_op {
|
#elif defined(__POWER9_VECTOR__)
|
||||||
|
//ppc implementation
|
||||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
#include "cpu_types_vsx.hpp"
|
||||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
|
||||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
|
||||||
|
|
||||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
|
||||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
|
||||||
|
|
||||||
#ifndef CPU_OP_GUARD
|
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
|
||||||
#else
|
#else
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
#warning "unsupported vLLM cpu implementation"
|
||||||
std::cout << #NAME << " invoked." << std::endl;
|
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
template <typename T, T... indexes, typename F>
|
|
||||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
|
||||||
(f(std::integral_constant<T, indexes>{}), ...);
|
|
||||||
}
|
|
||||||
}; // namespace
|
|
||||||
|
|
||||||
template <typename T, T count, typename F,
|
|
||||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
|
||||||
constexpr void unroll_loop(F &&f) {
|
|
||||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T> struct Vec {
|
|
||||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct FP32Vec8;
|
|
||||||
struct FP32Vec16;
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
|
||||||
|
|
||||||
__m128h reg;
|
|
||||||
|
|
||||||
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
|
||||||
|
|
||||||
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
|
||||||
|
|
||||||
explicit FP16Vec8(__m128h data) : reg(data) {}
|
|
||||||
|
|
||||||
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
|
||||||
|
|
||||||
__m128i reg;
|
|
||||||
|
|
||||||
explicit BF16Vec8(const void *ptr)
|
|
||||||
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
|
||||||
|
|
||||||
explicit BF16Vec8(const FP32Vec8 &);
|
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 16;
|
|
||||||
|
|
||||||
__m256i reg;
|
|
||||||
|
|
||||||
explicit BF16Vec16(const void *ptr)
|
|
||||||
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
|
||||||
|
|
||||||
explicit BF16Vec16(const FP32Vec16 &);
|
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
|
||||||
};
|
|
||||||
|
|
||||||
// Thirty-two BF16 values kept as raw 16-bit lanes of one 512-bit integer register.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  __m512i reg;

  // Loads 64 bytes from ptr with an unaligned load.
  explicit BF16Vec32(const void *ptr) : reg(_mm512_loadu_si512(ptr)) {}

  explicit BF16Vec32(__m512i data) : reg(data) {}

  // Repeats the eight BF16 values of vec8_data in each of the four 128-bit
  // quarters of the register. The argument is only read, so it is taken by
  // const reference (non-const lvalues still bind).
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg(_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512(vec8_data.reg),
                                   vec8_data.reg, 1),
                vec8_data.reg, 2),
            vec8_data.reg, 3)) {}

  // Stores 64 bytes to ptr. An explicit unaligned store mirrors the unaligned
  // load above; assigning through a dereferenced __m512i* would allow the
  // compiler to emit an aligned move, which faults on a misaligned ptr.
  void save(void *ptr) const { _mm512_storeu_si512(ptr, reg); }
};
|
|
||||||
|
|
||||||
// Four FP32 lanes in one 128-bit SSE register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    __m128 reg;
    float values[VEC_ELEM_NUM];
  };

  __m128 reg;

  // All lanes set to v.
  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}

  // All lanes set to zero.
  explicit FP32Vec4() : reg(_mm_setzero_ps()) {}

  // Unaligned load of four floats starting at ptr.
  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}

  // Wraps an existing register value.
  explicit FP32Vec4(__m128 data) : reg(data) {}

  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
};
|
|
||||||
|
|
||||||
// Eight FP32 lanes in one 256-bit AVX register.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };

  __m256 reg;

  // All lanes set to v.
  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}

  // All lanes set to zero.
  explicit FP32Vec8() : reg(_mm256_setzero_ps()) {}

  // Unaligned load of eight floats starting at ptr.
  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}

  // Wraps an existing register value.
  explicit FP32Vec8(__m256 data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}

#ifdef __AVX512FP16__
  // Widens eight FP16 values to FP32.
  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
#endif

  // Widens eight BF16 values to FP32: each 16-bit lane is zero-extended to
  // 32 bits and then moved up by two bytes so it becomes the upper half of
  // its 32-bit lane.
  explicit FP32Vec8(const BF16Vec8 &v)
      : reg(_mm256_castsi256_ps(
            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}

  // Sum of all lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    AliasReg lanes;
    lanes.reg = reg;
    float total = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&total, &lanes](int i) { total += lanes.values[i]; });
    return total;
  }

  // Lane-wise expf, evaluated one scalar at a time.
  FP32Vec8 exp() const {
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = expf(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  // Lane-wise tanhf, evaluated one scalar at a time.
  FP32Vec8 tanh() const {
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = tanhf(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  // Lane-wise erf, evaluated one scalar at a time.
  FP32Vec8 er() const {
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = erf(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    const __m256 product = _mm256_mul_ps(reg, b.reg);
    return FP32Vec8(product);
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    const __m256 sum = _mm256_add_ps(reg, b.reg);
    return FP32Vec8(sum);
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    const __m256 difference = _mm256_sub_ps(reg, b.reg);
    return FP32Vec8(difference);
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    const __m256 quotient = _mm256_div_ps(reg, b.reg);
    return FP32Vec8(quotient);
  }

  // Unaligned store of all eight lanes to ptr.
  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
|
|
||||||
|
|
||||||
// Sixteen FP32 lanes in one 512-bit AVX-512 register.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  // All lanes set to v.
  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  // All lanes set to zero.
  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  // Unaligned load of sixteen floats starting at ptr.
  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  // Wraps an existing register value.
  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  // Repeats the four lanes of data in each 128-bit quarter of the register.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg(_mm512_castsi512_ps(_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_castsi128_si512(_mm_castps_si128(data.reg)),
                    _mm_castps_si128(data.reg), 1),
                _mm_castps_si128(data.reg), 2),
            _mm_castps_si128(data.reg), 3))) {}

  // Repeats the eight lanes of data in both 256-bit halves of the register.
  // _mm512_inserti64x4 is used instead of _mm512_inserti32x8: both copy the
  // same 256 bits, but the latter additionally requires AVX512DQ.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg(_mm512_castsi512_ps(_mm512_inserti64x4(
            _mm512_castsi256_si512(_mm256_castps_si256(data.reg)),
            _mm256_castps_si256(data.reg), 1))) {}

  // Widens sixteen BF16 values to FP32. After zero-extension every 32-bit
  // lane is 0x0000hhhh, so a 16-bit left shift per lane produces the same
  // bits as the 2-byte shift across each 128-bit block, without needing
  // AVX512BW.
  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_slli_epi32(_mm512_cvtepu16_epi32(v.reg), 16))) {}

  // Widens eight BF16 values to FP32 and repeats them in both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  // Sum of all sixteen lanes.
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  // Sum of the group_size consecutive lanes that form group number idx
  // (lanes idx * group_size onwards). The register is not modified, so the
  // method is const like reduce_sum().
  template <int group_size> float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    // group_size low bits set, later shifted to the requested group.
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  // Unaligned store of all sixteen lanes to ptr.
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};
|
|
||||||
|
|
||||||
template <typename T> struct VecType { using vec_type = void; };
|
|
||||||
|
|
||||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
|
||||||
|
|
||||||
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
|
||||||
|
|
||||||
// Generic fallback: writes v through ptr by plain assignment, relying on T
// being assignable from float. Element types needing a different store are
// handled by the explicit specializations of this template.
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
|
||||||
*reinterpret_cast<_Float16 *>(ptr) = v;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
|
||||||
acc = acc + a * b;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __AVX512BF16__
|
|
||||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
|
||||||
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
|
||||||
: reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
|
|
||||||
|
|
||||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
|
||||||
: reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
|
|
||||||
|
|
||||||
inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
|
|
||||||
acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// Stores v as BF16 without rounding: the float is viewed as two consecutive
// 16-bit BFloat16 objects and the second one is copied to *ptr.
// NOTE(review): the second half is the sign/exponent half only on a
// little-endian target - confirm for any big-endian build.
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  // may_alias lets the float's storage be read through a BFloat16 pointer.
  c10::BFloat16 __attribute__((__may_alias__)) *halves =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = halves[1];
}
|
|
||||||
|
|
||||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
|
||||||
: reg(_mm256_cvtepi32_epi16(
|
|
||||||
_mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
|
|
||||||
|
|
||||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
|
||||||
: reg(_mm512_cvtepi32_epi16(
|
|
||||||
_mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Issues _mm_prefetch for addr with the _MM_HINT_T1 locality hint.
inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
|
|
||||||
|
|
||||||
}; // namespace vec_op
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
491
csrc/cpu/cpu_types_vsx.hpp
Normal file
491
csrc/cpu/cpu_types_vsx.hpp
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
|
||||||
|
#ifndef CPU_TYPES_VSX_HPP
|
||||||
|
#define CPU_TYPES_VSX_HPP
|
||||||
|
|
||||||
|
#include <altivec.h>
|
||||||
|
#include <cmath>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
|
|
||||||
|
#ifndef CPU_OP_GUARD
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
|
#else
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||||
|
std::cout << #NAME << " invoked." << std::endl;
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename T, T... indexes, typename F>
|
||||||
|
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||||
|
(f(std::integral_constant<T, indexes>{}), ...);
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
template <typename T, T count, typename F,
|
||||||
|
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||||
|
constexpr void unroll_loop(F &&f) {
|
||||||
|
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> struct Vec {
|
||||||
|
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct ss16x8x2_t {
|
||||||
|
__vector signed short val[2];
|
||||||
|
} ss16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct ss16x8x4_t {
|
||||||
|
__vector signed short val[4];
|
||||||
|
} ss16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct f32x4x2_t {
|
||||||
|
__vector float val[2];
|
||||||
|
} f32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct f32x4x4_t {
|
||||||
|
__vector float val[4];
|
||||||
|
} f32x4x4_t;
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
// Eight BF16 values kept as raw 16-bit lanes of one VSX vector register.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __vector signed short reg;

  // Loads 16 bytes from ptr with vec_xl.
  explicit BF16Vec8(const void *ptr)
      : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {}

  // Conversion from FP32; only declared here.
  explicit BF16Vec8(const FP32Vec8 &);

  // Stores 16 bytes to ptr with vec_xst, matching the vec_xl load above and
  // BF16Vec16::save. Assigning through a dereferenced vector pointer would
  // let the compiler assume ptr is 16-byte aligned.
  void save(void *ptr) const { vec_xst(reg, 0, (signed short *)ptr); }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 values kept as raw 16-bit lanes of two VSX vector registers.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  ss16x8x2_t reg;

  // Reads 32 bytes from ptr as two 16-byte vectors (byte offsets 0 and 16).
  explicit BF16Vec16(const void *ptr) {
    signed short *src = (signed short *)ptr;
    reg.val[0] = (__vector signed short)vec_xl(0, src);
    reg.val[1] = (__vector signed short)vec_xl(16, src);
  }

  // Conversion from FP32; only declared here.
  explicit BF16Vec16(const FP32Vec16 &);

  // Writes 32 bytes to ptr as two 16-byte vectors (byte offsets 0 and 16).
  void save(void *ptr) const {
    signed short *dst = (signed short *)ptr;
    vec_xst(reg.val[0], 0, dst);
    vec_xst(reg.val[1], 16, dst);
  }
};
|
||||||
|
|
||||||
|
const static __vector signed short zero = vec_splats((signed short)0);
|
||||||
|
|
||||||
|
// Thirty-two BF16 values kept as raw 16-bit lanes of four VSX vector registers.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  ss16x8x4_t reg;

  // Reads 64 bytes from ptr as four 16-byte vectors. vec_xl is used, as in
  // BF16Vec16, instead of dereferencing an ss16x8x4_t*, which would let the
  // compiler assume ptr is 16-byte aligned.
  explicit BF16Vec32(const void *ptr) {
    reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr);
    reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr);
    reg.val[2] = (__vector signed short)vec_xl(32, (signed short *)ptr);
    reg.val[3] = (__vector signed short)vec_xl(48, (signed short *)ptr);
  }

  explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}

  // Repeats the eight BF16 values of vec8_data in all four registers.
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}

  // Writes 64 bytes to ptr as four 16-byte vectors, using vec_xst for the
  // same alignment reason as the pointer constructor.
  void save(void *ptr) const {
    vec_xst(reg.val[0], 0, (signed short *)ptr);
    vec_xst(reg.val[1], 16, (signed short *)ptr);
    vec_xst(reg.val[2], 32, (signed short *)ptr);
    vec_xst(reg.val[3], 48, (signed short *)ptr);
  }
};
|
||||||
|
|
||||||
|
// Four FP32 lanes in one VSX vector register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    __vector float reg;
    float values[VEC_ELEM_NUM];
  };

  __vector float reg;

  // All lanes set to v.
  explicit FP32Vec4(float v) : reg(vec_splats(v)) {}

  // All lanes set to zero.
  explicit FP32Vec4() : FP32Vec4(0.0f) {}

  // Loads four floats starting at ptr with vec_xl.
  explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {}

  // Wraps an existing register value.
  explicit FP32Vec4(__vector float data) : reg(data) {}

  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
};
|
||||||
|
|
||||||
|
// Eight FP32 lanes held in two VSX vector registers (val[0] = lanes 0-3,
// val[1] = lanes 4-7).
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    f32x4x2_t reg;
    float values[VEC_ELEM_NUM];
  };

  f32x4x2_t reg;

  // All lanes set to v.
  explicit FP32Vec8(float v) {
    const __vector float splat = vec_splats(v);
    reg.val[0] = splat;
    reg.val[1] = splat;
  }

  // All lanes set to zero.
  explicit FP32Vec8() : FP32Vec8(0.0f) {}

  // Loads eight floats starting at ptr (byte offsets 0 and 16).
  explicit FP32Vec8(const float *ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
  }

  // Wraps an existing register pair.
  explicit FP32Vec8(f32x4x2_t data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}

  // Widens eight BF16 values to FP32 by interleaving each 16-bit value with
  // a zero halfword taken from the file-level `zero` vector.
  explicit FP32Vec8(const BF16Vec8 &v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
  }

  // Sum of all lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    AliasReg lanes;
    lanes.reg = reg;
    float total = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&total, &lanes](int i) { total += lanes.values[i]; });
    return total;
  }

  // Lane-wise std::exp, evaluated one scalar at a time.
  FP32Vec8 exp() const {
    // TODO: Vectorize this
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = std::exp(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  // Lane-wise std::tanh, evaluated one scalar at a time.
  FP32Vec8 tanh() const {
    // TODO: Vectorize this
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = std::tanh(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  // Lane-wise std::erf, evaluated one scalar at a time.
  FP32Vec8 er() const {
    // TODO: Vectorize this
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    for (int i = 0; i < VEC_ELEM_NUM; ++i) {
      out.values[i] = std::erf(in.values[i]);
    }
    return FP32Vec8(out.reg);
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    f32x4x2_t out;
    out.val[0] = vec_mul(reg.val[0], b.reg.val[0]);
    out.val[1] = vec_mul(reg.val[1], b.reg.val[1]);
    return FP32Vec8(out);
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    f32x4x2_t out;
    out.val[0] = vec_add(reg.val[0], b.reg.val[0]);
    out.val[1] = vec_add(reg.val[1], b.reg.val[1]);
    return FP32Vec8(out);
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    f32x4x2_t out;
    out.val[0] = vec_sub(reg.val[0], b.reg.val[0]);
    out.val[1] = vec_sub(reg.val[1], b.reg.val[1]);
    return FP32Vec8(out);
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    f32x4x2_t out;
    out.val[0] = vec_div(reg.val[0], b.reg.val[0]);
    out.val[1] = vec_div(reg.val[1], b.reg.val[1]);
    return FP32Vec8(out);
  }

  // Stores all eight lanes to ptr (byte offsets 0 and 16).
  void save(float *ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
  }
};
|
||||||
|
|
||||||
|
// Sixteen FP32 lanes held in four VSX vector registers (val[k] = lanes
// 4k .. 4k+3).
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving scalar access to the individual lanes.
  union AliasReg {
    f32x4x4_t reg;
    float values[VEC_ELEM_NUM];
  };

  f32x4x4_t reg;

  // All lanes set to v.
  explicit FP32Vec16(float v) {
    const __vector float splat = vec_splats(v);
    for (int k = 0; k < 4; ++k) {
      reg.val[k] = splat;
    }
  }

  // All lanes set to zero.
  explicit FP32Vec16() : FP32Vec16(0.0f) {}

  // Loads sixteen floats starting at ptr (byte offsets 0, 16, 32 and 48).
  explicit FP32Vec16(const float *ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
    reg.val[2] = vec_xl(32, ptr);
    reg.val[3] = vec_xl(48, ptr);
  }

  // Wraps an existing register quadruple.
  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  // Repeats the four lanes of data in all four registers.
  explicit FP32Vec16(const FP32Vec4 &data) {
    for (int k = 0; k < 4; ++k) {
      reg.val[k] = data.reg;
    }
  }

  // Repeats the eight lanes of data twice: registers 0-1 and registers 2-3.
  explicit FP32Vec16(const FP32Vec8 &data) {
    for (int k = 0; k < 4; ++k) {
      reg.val[k] = data.reg.val[k % 2];
    }
  }

  // Widens sixteen BF16 values to FP32 by interleaving each 16-bit value
  // with a zero halfword taken from the file-level `zero` vector.
  explicit FP32Vec16(const BF16Vec16 &v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
  }

  // Widens eight BF16 values to FP32 and repeats them twice.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    f32x4x4_t out;
    for (int k = 0; k < 4; ++k) {
      out.val[k] = vec_mul(reg.val[k], b.reg.val[k]);
    }
    return FP32Vec16(out);
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    f32x4x4_t out;
    for (int k = 0; k < 4; ++k) {
      out.val[k] = vec_add(reg.val[k], b.reg.val[k]);
    }
    return FP32Vec16(out);
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    f32x4x4_t out;
    for (int k = 0; k < 4; ++k) {
      out.val[k] = vec_sub(reg.val[k], b.reg.val[k]);
    }
    return FP32Vec16(out);
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    f32x4x4_t out;
    for (int k = 0; k < 4; ++k) {
      out.val[k] = vec_div(reg.val[k], b.reg.val[k]);
    }
    return FP32Vec16(out);
  }

  // Sum of all lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    AliasReg lanes;
    lanes.reg = reg;
    float total = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&total, &lanes](int i) { total += lanes.values[i]; });
    return total;
  }

  // Sum of the group_size consecutive lanes starting at idx * group_size,
  // accumulated from the lowest lane upwards.
  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);

    AliasReg lanes;
    lanes.reg = reg;
    float total = 0;
    const int first = idx * group_size;
    unroll_loop<int, group_size>(
        [&total, &first, &lanes](int i) { total += lanes.values[first + i]; });
    return total;
  }

  // Stores all sixteen lanes to ptr (byte offsets 0, 16, 32 and 48).
  void save(float *ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
    vec_xst(reg.val[2], 32, ptr);
    vec_xst(reg.val[3], 48, ptr);
  }
};
|
||||||
|
|
||||||
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|
||||||
|
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||||
|
|
||||||
|
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||||
|
|
||||||
|
// Generic fallback: writes v through ptr by plain assignment, relying on T
// being assignable from float. Element types needing a different store are
// handled by the explicit specializations of this template.
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
||||||
|
|
||||||
|
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
||||||
|
acc = acc + a * b;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores v as BF16 without rounding: the float is viewed as two consecutive
// 16-bit BFloat16 objects and the second one is copied to *ptr.
// NOTE(review): the second half is the sign/exponent half only on a
// little-endian target - confirm for any big-endian POWER build.
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  // may_alias lets the float's storage be read through a BFloat16 pointer.
  c10::BFloat16 __attribute__((__may_alias__)) *halves =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = halves[1];
}
|
||||||
|
|
||||||
|
#ifndef __VEC_CLASS_FP_NAN
|
||||||
|
#define __VEC_CLASS_FP_NAN (1 << 6)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
|
||||||
|
#ifndef _ARCH_PWR10
|
||||||
|
const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
|
||||||
|
const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
|
||||||
|
const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
|
||||||
|
const static __vector unsigned int one = { 1, 1, 1, 1 };
|
||||||
|
#endif
|
||||||
|
|
||||||
|
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
__vector signed short ret[2];
|
||||||
|
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||||
|
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||||
|
reg = vec_perm(ret[0], ret[1], omask);
|
||||||
|
#elif defined(_ARCH_PWR9)
|
||||||
|
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||||
|
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||||
|
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||||
|
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||||
|
lsb0 = vec_and(lsb0, one);
|
||||||
|
lsb1 = vec_and(lsb1, one);
|
||||||
|
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||||
|
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||||
|
inp0 = vec_add(inp0, rnd0);
|
||||||
|
inp1 = vec_add(inp1, rnd1);
|
||||||
|
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||||
|
inp0 = vec_sel(inp0, nan, sel0);
|
||||||
|
inp1 = vec_sel(inp1, nan, sel1);
|
||||||
|
inp0 = vec_sr(inp0, sh16);
|
||||||
|
inp1 = vec_sr(inp1, sh16);
|
||||||
|
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
__vector signed short ret[4];
|
||||||
|
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||||
|
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||||
|
ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
|
||||||
|
ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
|
||||||
|
reg.val[0] = vec_perm(ret[0], ret[1], omask);
|
||||||
|
reg.val[1] = vec_perm(ret[2], ret[3], omask);
|
||||||
|
#elif defined(_ARCH_PWR9)
|
||||||
|
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||||
|
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||||
|
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
|
||||||
|
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
|
||||||
|
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||||
|
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||||
|
__vector unsigned int lsb2 = vec_sr(inp2, sh16);
|
||||||
|
__vector unsigned int lsb3 = vec_sr(inp3, sh16);
|
||||||
|
lsb0 = vec_and(lsb0, one);
|
||||||
|
lsb1 = vec_and(lsb1, one);
|
||||||
|
lsb2 = vec_and(lsb2, one);
|
||||||
|
lsb3 = vec_and(lsb3, one);
|
||||||
|
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||||
|
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||||
|
__vector unsigned int rnd2 = vec_add(lsb2, bias);
|
||||||
|
__vector unsigned int rnd3 = vec_add(lsb3, bias);
|
||||||
|
inp0 = vec_add(inp0, rnd0);
|
||||||
|
inp1 = vec_add(inp1, rnd1);
|
||||||
|
inp2 = vec_add(inp2, rnd2);
|
||||||
|
inp3 = vec_add(inp3, rnd3);
|
||||||
|
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
|
||||||
|
inp0 = vec_sel(inp0, nan, sel0);
|
||||||
|
inp1 = vec_sel(inp1, nan, sel1);
|
||||||
|
inp2 = vec_sel(inp2, nan, sel2);
|
||||||
|
inp3 = vec_sel(inp3, nan, sel3);
|
||||||
|
inp0 = vec_sr(inp0, sh16);
|
||||||
|
inp1 = vec_sr(inp1, sh16);
|
||||||
|
inp2 = vec_sr(inp2, sh16);
|
||||||
|
inp3 = vec_sr(inp3, sh16);
|
||||||
|
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||||
|
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void prefetch(const void *addr) {
|
||||||
|
__asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
|
||||||
|
}
|
||||||
|
|
||||||
|
}; // namespace vec_op
|
||||||
|
|
||||||
|
#endif
|
||||||
515
csrc/cpu/cpu_types_x86.hpp
Normal file
515
csrc/cpu/cpu_types_x86.hpp
Normal file
@@ -0,0 +1,515 @@
|
|||||||
|
|
||||||
|
#ifndef CPU_TYPES_X86_HPP
|
||||||
|
#define CPU_TYPES_X86_HPP
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#ifndef __AVX2__
|
||||||
|
static_assert(false, "AVX2 must be supported for the current implementation.");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
|
|
||||||
|
#ifndef CPU_OP_GUARD
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
|
#else
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||||
|
std::cout << #NAME << " invoked." << std::endl;
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename T, T... indexes, typename F>
|
||||||
|
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||||
|
(f(std::integral_constant<T, indexes>{}), ...);
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
template <typename T, T count, typename F,
|
||||||
|
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||||
|
constexpr void unroll_loop(F &&f) {
|
||||||
|
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> struct Vec {
|
||||||
|
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
// Eight FP16 lanes in one 128-bit AVX512-FP16 register.
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128h reg;

  // All lanes set to v.
  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}

  // Unaligned load of eight FP16 values starting at ptr.
  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}

  // Wraps an existing register value.
  explicit FP16Vec8(__m128h data) : reg(data) {}

  FP16Vec8 operator*(const FP16Vec8 &b) const {
    const __m128h product = _mm_mul_ph(reg, b.reg);
    return FP16Vec8(product);
  }

  FP16Vec8 operator+(const FP16Vec8 &b) const {
    const __m128h sum = _mm_add_ph(reg, b.reg);
    return FP16Vec8(sum);
  }

  FP16Vec8 operator-(const FP16Vec8 &b) const {
    const __m128h difference = _mm_sub_ph(reg, b.reg);
    return FP16Vec8(difference);
  }

  FP16Vec8 operator/(const FP16Vec8 &b) const {
    const __m128h quotient = _mm_div_ph(reg, b.reg);
    return FP16Vec8(quotient);
  }

  // Unaligned store of all eight lanes to ptr.
  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Eight BF16 values kept as raw 16-bit lanes of one 128-bit integer register.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128i reg;

  // Loads 16 bytes from ptr with an unaligned load.
  explicit BF16Vec8(const void *ptr)
      : reg(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  // Conversion from FP32; only declared here.
  explicit BF16Vec8(const FP32Vec8 &);

  // Stores 16 bytes to ptr. An explicit unaligned store mirrors the unaligned
  // load above; assigning through a dereferenced __m128i* would allow the
  // compiler to emit an aligned move, which faults on a misaligned ptr.
  void save(void *ptr) const {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), reg);
  }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 values kept as raw 16-bit lanes of one 256-bit integer register.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  __m256i reg;

  // Loads 32 bytes from ptr with an unaligned load.
  explicit BF16Vec16(const void *ptr)
      : reg(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  // Conversion from FP32; only declared here.
  explicit BF16Vec16(const FP32Vec16 &);

  // Stores 32 bytes to ptr. An explicit unaligned store mirrors the unaligned
  // load above; assigning through a dereferenced __m256i* would allow the
  // compiler to emit an aligned move, which faults on a misaligned ptr.
  void save(void *ptr) const {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), reg);
  }
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Thirty-two bfloat16 lanes stored as raw 16-bit patterns in one __m512i
// register (variant compiled when __AVX512F__ is defined).
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  __m512i reg;

  // Loads 64 bytes from `ptr` with an unaligned load.
  explicit BF16Vec32(const void *ptr) : reg(_mm512_loadu_si512(ptr)) {}

  // Adopts an already populated register.
  explicit BF16Vec32(__m512i data) : reg(data) {}

  // Repeats the eight lanes of `vec8_data` in all four 128-bit slots.
  // The argument is only read, so it is taken by const reference.
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg(_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512(vec8_data.reg),
                                   vec8_data.reg, 1),
                vec8_data.reg, 2),
            vec8_data.reg, 3)) {}

  // Stores 64 bytes to `ptr`. An unaligned store is used so that save()
  // accepts the same pointers as the loading constructor; writing through a
  // dereferenced __m512i* would require 64-byte alignment.
  void save(void *ptr) const { _mm512_storeu_si512(ptr, reg); }
};
|
||||||
|
#else
|
||||||
|
// Thirty-two bfloat16 lanes stored as raw 16-bit patterns in two __m256i
// registers (fallback compiled when __AVX512F__ is not defined).
// reg_low holds lanes 0-15 and reg_high holds lanes 16-31.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  __m256i reg_low;
  __m256i reg_high;

  // Loads 64 bytes from `ptr` as two unaligned 32-byte loads.
  explicit BF16Vec32(const void *ptr)
      : reg_low(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))),
        reg_high(
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr) + 1)) {}

  // Adopts two already populated registers.
  explicit BF16Vec32(__m256i low, __m256i high)
      : reg_low(low), reg_high(high) {}

  // Repeats the eight lanes of `vec8_data` in all four 128-bit slots.
  // _mm256_inserti128_si256 is the AVX2 form of the 128-bit insert; the
  // AVX512VL-only _mm256_inserti32x4 must not be used in this branch, which
  // is built precisely when AVX512 is unavailable.
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg_low(_mm256_inserti128_si256(
            _mm256_castsi128_si256(vec8_data.reg), vec8_data.reg, 1)),
        reg_high(_mm256_inserti128_si256(
            _mm256_castsi128_si256(vec8_data.reg), vec8_data.reg, 1)) {}

  // Stores 64 bytes to `ptr` as two unaligned 32-byte stores, so save()
  // accepts the same pointers as the loading constructor.
  void save(void *ptr) const {
    __m256i *dst = reinterpret_cast<__m256i *>(ptr);
    _mm256_storeu_si256(dst, reg_low);
    _mm256_storeu_si256(dst + 1, reg_high);
  }
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Four single-precision lanes held in one __m128 register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  // Overlay giving per-lane float access to a register value.
  union AliasReg {
    __m128 reg;
    float values[VEC_ELEM_NUM];
  };

  __m128 reg;

  // Broadcasts `v` to every lane.
  explicit FP32Vec4(float v)
      : reg(_mm_set1_ps(v)) {}

  // All lanes zero.
  explicit FP32Vec4()
      : reg(_mm_set1_ps(0.0)) {}

  // Unaligned load of four floats starting at `ptr`.
  explicit FP32Vec4(const float *ptr)
      : reg(_mm_loadu_ps(ptr)) {}

  // Adopts an already populated register.
  explicit FP32Vec4(__m128 data)
      : reg(data) {}

  // Copies the register of another vector.
  explicit FP32Vec4(const FP32Vec4 &data)
      : reg(data.reg) {}
};
|
||||||
|
|
||||||
|
// Eight single-precision lanes held in one __m256 register.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  // Overlay giving per-lane float access to a register value; used by the
  // scalar fallbacks below.
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };

  __m256 reg;

  // Broadcasts `v` to every lane.
  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}

  // All lanes zero.
  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}

  // Unaligned load of eight floats starting at `ptr`.
  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}

  // Adopts an already populated register.
  explicit FP32Vec8(__m256 data) : reg(data) {}

  // Copies the register of another vector.
  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}

#ifdef __AVX512FP16__
  // Widens eight half-precision lanes to single precision.
  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
#endif

  // Widens eight bfloat16 lanes to single precision. Each 16-bit pattern is
  // zero-extended to 32 bits and then moved into the upper half of its 32-bit
  // element by a 2-byte left shift of each 128-bit lane; the bytes that cross
  // element boundaries are the zero upper halves, so no lane is disturbed.
  explicit FP32Vec8(const BF16Vec8 &v)
      : reg(_mm256_castsi256_ps(
            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}

  // Returns the sum of all eight lanes, accumulated from lane 0 upwards.
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });

    return result;
  }

  // Lane-wise expf, computed one scalar at a time.
  FP32Vec8 exp() const {
    AliasReg ar;
    ar.reg = reg;
    // _mm256_set_ps takes its arguments from the highest lane to the lowest.
    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
                                  expf(ar.values[5]), expf(ar.values[4]),
                                  expf(ar.values[3]), expf(ar.values[2]),
                                  expf(ar.values[1]), expf(ar.values[0])));
  }

  // Lane-wise tanhf, computed one scalar at a time.
  FP32Vec8 tanh() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
  }

  // Lane-wise error function, computed one scalar at a time. The float
  // variant erff is called explicitly, matching expf/tanhf above, so the
  // lanes are not promoted to double and narrowed back again.
  FP32Vec8 er() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(erff(ar.values[7]), erff(ar.values[6]),
                                  erff(ar.values[5]), erff(ar.values[4]),
                                  erff(ar.values[3]), erff(ar.values[2]),
                                  erff(ar.values[1]), erff(ar.values[0])));
  }

  // Lane-wise product.
  FP32Vec8 operator*(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
  }

  // Lane-wise sum.
  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_add_ps(reg, b.reg));
  }

  // Lane-wise difference.
  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
  }

  // Lane-wise quotient.
  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_div_ps(reg, b.reg));
  }

  // Unaligned store of the eight lanes to `ptr`.
  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Sixteen single-precision lanes held in one __m512 register (variant
// compiled when __AVX512F__ is defined).
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving per-lane float access to a register value.
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  // Broadcasts `v` to every lane.
  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  // All lanes zero.
  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  // Unaligned load of sixteen floats starting at `ptr`.
  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  // Adopts an already populated register.
  explicit FP32Vec16(__m512 data) : reg(data) {}

  // Copies the register of another vector.
  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  // Repeats the four lanes of `data` in all four 128-bit slots.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}

  // Repeats the eight lanes of `data` in both 256-bit halves.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti32x8(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}

  // Widens sixteen bfloat16 lanes to single precision: zero-extend each
  // 16-bit pattern to 32 bits, then shift each 128-bit lane left by 2 bytes
  // so the pattern lands in the upper half of its 32-bit element.
  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}

  // Widens eight bfloat16 lanes and repeats them in both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  // Lane-wise product.
  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  // Lane-wise sum.
  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  // Lane-wise difference.
  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  // Lane-wise quotient.
  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  // Returns the sum of all sixteen lanes.
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  // Returns the sum of the `group_size` consecutive lanes that form group
  // number `idx` (group 0 starts at lane 0). The mask selects exactly those
  // lanes. The method only reads the register, so it is const like
  // reduce_sum().
  template <int group_size> float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  // Unaligned store of the sixteen lanes to `ptr`.
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};
|
||||||
|
#else
|
||||||
|
// Sixteen single-precision lanes held in two __m256 registers (fallback
// compiled when __AVX512F__ is not defined). reg_low holds lanes 0-7 and
// reg_high holds lanes 8-15.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving per-lane float access to one 256-bit half.
  union AliasReg {
    __m256 reg;
    float values[8];
  };

  __m256 reg_low;
  __m256 reg_high;

  // Broadcasts `v` to every lane.
  explicit FP32Vec16(float v)
      : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {}

  // All lanes zero.
  explicit FP32Vec16()
      : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {}

  // Unaligned load of sixteen floats starting at `ptr`.
  explicit FP32Vec16(const float *ptr)
      : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {}

  // Adopts two already populated registers.
  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}

  // Copies both registers of another vector.
  explicit FP32Vec16(const FP32Vec16 &data)
      : reg_low(data.reg_low), reg_high(data.reg_high) {}

  // Repeats the four lanes of `data` in all four 128-bit slots.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg_low((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
        reg_high((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg,
            1)) {}

  // Repeats the eight lanes of `data` in both halves.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg_low(data.reg), reg_high(data.reg) {}

  // Widens sixteen bfloat16 lanes to single precision, eight at a time:
  // zero-extend each 16-bit pattern to 32 bits, then shift each 128-bit lane
  // left by 2 bytes so the pattern lands in the upper half of its element.
  explicit FP32Vec16(const BF16Vec16 &v) {
    __m128i low = _mm256_extractf128_si256(v.reg, 0);
    __m128i high = _mm256_extractf128_si256(v.reg, 1);

    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);

    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);

    reg_low = _mm256_castsi256_ps(v_low_shifted);
    reg_high = _mm256_castsi256_ps(v_high_shifted);
  }

  // Widens eight bfloat16 lanes and repeats them in both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  // Lane-wise product.
  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
                     _mm256_mul_ps(reg_high, b.reg_high));
  }

  // Lane-wise sum.
  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
                     _mm256_add_ps(reg_high, b.reg_high));
  }

  // Lane-wise difference.
  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
                     _mm256_sub_ps(reg_high, b.reg_high));
  }

  // Lane-wise quotient.
  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
                     _mm256_div_ps(reg_high, b.reg_high));
  }

  // Returns the sum of all sixteen lanes (low half plus high half).
  float reduce_sum() const {
    FP32Vec8 low = FP32Vec8(reg_low);
    FP32Vec8 high = FP32Vec8(reg_high);
    return low.reduce_sum() + high.reduce_sum();
  }

  // Returns the sum of the `group_size` consecutive lanes that form group
  // number `idx` (group 0 starts at lane 0). The method only reads the
  // registers, so it is const like reduce_sum().
  template <int group_size> float reduce_sub_sum(int idx) const {
    float sum = 0.0;
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    // One bit per lane; bit k set means lane k belongs to the group.
    uint32_t mask = base_mask << (idx * group_size);

    AliasReg ar;

    // Consumes one mask bit per visited lane, lowest bit first. The mask
    // persists across the two passes below, so the second pass sees bits
    // 8-15.
    auto func = [&sum, &mask, &ar](int i) {
      int flag = mask & 0x1;
      mask = mask >> 1;
      if (flag != 0) sum += ar.values[i];
    };

    ar.reg = reg_low;
    unroll_loop<int, 8>(func);

    ar.reg = reg_high;
    unroll_loop<int, 8>(func);

    return sum;
  }

  // Unaligned store of the sixteen lanes to `ptr`.
  void save(float *ptr) const {
    _mm256_storeu_ps(ptr, reg_low);
    _mm256_storeu_ps(ptr + 8, reg_high);
  }
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|
||||||
|
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||||
|
|
||||||
|
// Writes the float `v` to `*ptr`, relying on the conversion that assigning a
// float to a T performs. Specialisations elsewhere handle element types that
// need a dedicated narrowing step.
template <typename T>
void storeFP32(float v, T *ptr) {
  ptr[0] = v;
}
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
// Half-precision store: the destination is written through a _Float16
// pointer, so the float -> half conversion is the compiler's built-in one.
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  _Float16 *const dst = reinterpret_cast<_Float16 *>(ptr);
  *dst = v;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Accumulates the lane-wise product of `a` and `b` into `acc`, expressed
// with the vector's own operator* and operator+.
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  acc = acc + (a * b);
}
|
||||||
|
|
||||||
|
#ifdef __AVX512BF16__
|
||||||
|
// bfloat16 store for builds with __AVX512BF16__: the value is converted by
// _mm_cvtness_sbh and written through a __bfloat16 pointer.
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  __bfloat16 *const dst = reinterpret_cast<__bfloat16 *>(ptr);
  *dst = _mm_cvtness_sbh(v);
}
|
||||||
|
|
||||||
|
// Narrows eight FP32 lanes to bfloat16 using the AVX512-BF16 conversion
// intrinsic and keeps the result as raw integer lanes.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
||||||
|
: reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
|
||||||
|
|
||||||
|
// Narrows sixteen FP32 lanes to bfloat16 using the AVX512-BF16 conversion
// intrinsic and keeps the result as raw integer lanes.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
||||||
|
: reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
|
||||||
|
|
||||||
|
// Accumulates the bfloat16 dot product of `a` and `b` into the FP32
// accumulator `acc` via _mm512_dpbf16_ps. The integer registers are
// reinterpreted as packed bfloat16 for the call.
// NOTE(review): per the intrinsic's definition each FP32 lane should receive
// the products of the two adjacent bf16 pairs -- confirm against callers.
inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
|
||||||
|
acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// bfloat16 store for builds without __AVX512BF16__. The float's storage is
// viewed as two 16-bit halves and the second one is copied out, i.e. the
// value is truncated rather than rounded.
// NOTE(review): picking index 1 as the upper half assumes little-endian
// layout, which holds for the x86 targets this header addresses.
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *halves =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = halves[1];
}
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Narrows eight FP32 lanes to bfloat16 by truncation: each 128-bit lane is
// shifted right by 2 bytes so the upper half of every float lands in the low
// 16 bits of its element, then each 32-bit element is truncated to 16 bits.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
||||||
|
: reg(_mm256_cvtepi32_epi16(
|
||||||
|
_mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
|
||||||
|
|
||||||
|
// Narrows sixteen FP32 lanes to bfloat16 by truncation: each 128-bit lane is
// shifted right by 2 bytes so the upper half of every float lands in the low
// 16 bits of its element, then each 32-bit element is truncated to 16 bits.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
||||||
|
: reg(_mm512_cvtepi32_epi16(
|
||||||
|
_mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
|
||||||
|
#else
|
||||||
|
namespace{
|
||||||
|
__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
|
||||||
|
__m256i ai = _mm256_castps_si256(a);
|
||||||
|
ai = _mm256_srli_epi32(ai, 16);
|
||||||
|
ai = _mm256_packus_epi32(ai, ai);
|
||||||
|
ai = _mm256_permute4x64_epi64(ai, 0b00111001);
|
||||||
|
return _mm256_extracti128_si256(ai, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AVX2 fallback: narrows eight FP32 lanes to bfloat16 by delegating to
// FP32Vec8_to_BF16Vec8_avx2.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
||||||
|
: reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
|
||||||
|
|
||||||
|
// AVX2 fallback: narrows each 8-lane half of `v` separately, then joins the
// two 128-bit results with the low half in the lower position.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
  BF16Vec8 lower_half = BF16Vec8(FP32Vec8(v.reg_low));
  BF16Vec8 upper_half = BF16Vec8(FP32Vec8(v.reg_high));
  const __m256i widened_lower = _mm256_castsi128_si256(lower_half.reg);
  reg = _mm256_insertf128_si256(widened_lower, upper_half.reg, 1);
}
|
||||||
|
#endif // __AVX512F__
|
||||||
|
#endif // __AVX512BF16__
|
||||||
|
|
||||||
|
// Issues a prefetch for `addr` with the _MM_HINT_T1 locality hint.
inline void prefetch(const void *addr) {
  _mm_prefetch(addr, _MM_HINT_T1);
}
|
||||||
|
|
||||||
|
}; // namespace vec_op
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
|
ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
|
||||||
ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
|
ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
|
||||||
|
|
||||||
|
// Quick GELU implementation.
|
||||||
|
ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
|
||||||
|
ops.impl("gelu_quick", torch::kCPU, &gelu_quick);
|
||||||
|
|
||||||
// Layernorm
|
// Layernorm
|
||||||
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
||||||
ops.def(
|
ops.def(
|
||||||
|
|||||||
13
csrc/ops.h
13
csrc/ops.h
@@ -1,5 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <optional>
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
void paged_attention_v1(
|
void paged_attention_v1(
|
||||||
@@ -49,6 +50,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);
|
|||||||
|
|
||||||
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||||
const torch::Tensor& codebooks,
|
const torch::Tensor& codebooks,
|
||||||
@@ -90,9 +93,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
int64_t size_k, int64_t size_n,
|
int64_t size_k, int64_t size_n,
|
||||||
int64_t num_bits);
|
int64_t num_bits);
|
||||||
|
|
||||||
|
torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||||
|
torch::Tensor& b_scales, torch::Tensor& workspace,
|
||||||
|
int64_t num_bits, int64_t size_m, int64_t size_n,
|
||||||
|
int64_t size_k);
|
||||||
|
|
||||||
|
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||||
|
|
||||||
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales);
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -16,14 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 512) \
|
f(in_T, out_T, W_T, narrow, 512) \
|
||||||
f(in_T, out_T, W_T, narrow, 640) \
|
f(in_T, out_T, W_T, narrow, 640) \
|
||||||
f(in_T, out_T, W_T, narrow, 768) \
|
f(in_T, out_T, W_T, narrow, 768) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 896) \
|
||||||
f(in_T, out_T, W_T, narrow, 1024) \
|
f(in_T, out_T, W_T, narrow, 1024) \
|
||||||
f(in_T, out_T, W_T, narrow, 1152) \
|
f(in_T, out_T, W_T, narrow, 1152) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 1216) \
|
||||||
f(in_T, out_T, W_T, narrow, 1280) \
|
f(in_T, out_T, W_T, narrow, 1280) \
|
||||||
f(in_T, out_T, W_T, narrow, 1536) \
|
f(in_T, out_T, W_T, narrow, 1536) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 1664) \
|
||||||
f(in_T, out_T, W_T, narrow, 1728) \
|
f(in_T, out_T, W_T, narrow, 1728) \
|
||||||
f(in_T, out_T, W_T, narrow, 1792) \
|
f(in_T, out_T, W_T, narrow, 1792) \
|
||||||
f(in_T, out_T, W_T, narrow, 2048) \
|
f(in_T, out_T, W_T, narrow, 2048) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2240) \
|
||||||
f(in_T, out_T, W_T, narrow, 2304) \
|
f(in_T, out_T, W_T, narrow, 2304) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2368) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2432) \
|
||||||
f(in_T, out_T, W_T, narrow, 2560) \
|
f(in_T, out_T, W_T, narrow, 2560) \
|
||||||
f(in_T, out_T, W_T, narrow, 2752) \
|
f(in_T, out_T, W_T, narrow, 2752) \
|
||||||
f(in_T, out_T, W_T, narrow, 2816) \
|
f(in_T, out_T, W_T, narrow, 2816) \
|
||||||
@@ -31,32 +37,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 3328) \
|
f(in_T, out_T, W_T, narrow, 3328) \
|
||||||
f(in_T, out_T, W_T, narrow, 3456) \
|
f(in_T, out_T, W_T, narrow, 3456) \
|
||||||
f(in_T, out_T, W_T, narrow, 3584) \
|
f(in_T, out_T, W_T, narrow, 3584) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 3712) \
|
||||||
f(in_T, out_T, W_T, narrow, 4096) \
|
f(in_T, out_T, W_T, narrow, 4096) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4480) \
|
||||||
f(in_T, out_T, W_T, narrow, 4608) \
|
f(in_T, out_T, W_T, narrow, 4608) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4736) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4864) \
|
||||||
f(in_T, out_T, W_T, narrow, 5120) \
|
f(in_T, out_T, W_T, narrow, 5120) \
|
||||||
f(in_T, out_T, W_T, narrow, 5504) \
|
f(in_T, out_T, W_T, narrow, 5504) \
|
||||||
f(in_T, out_T, W_T, narrow, 5632) \
|
f(in_T, out_T, W_T, narrow, 5632) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 5888) \
|
||||||
f(in_T, out_T, W_T, narrow, 6144) \
|
f(in_T, out_T, W_T, narrow, 6144) \
|
||||||
f(in_T, out_T, W_T, narrow, 6400) \
|
f(in_T, out_T, W_T, narrow, 6400) \
|
||||||
f(in_T, out_T, W_T, narrow, 6848) \
|
f(in_T, out_T, W_T, narrow, 6848) \
|
||||||
f(in_T, out_T, W_T, narrow, 6912) \
|
f(in_T, out_T, W_T, narrow, 6912) \
|
||||||
f(in_T, out_T, W_T, narrow, 7168) \
|
f(in_T, out_T, W_T, narrow, 7168) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 7424) \
|
||||||
f(in_T, out_T, W_T, narrow, 8192) \
|
f(in_T, out_T, W_T, narrow, 8192) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 8960) \
|
||||||
f(in_T, out_T, W_T, narrow, 9216) \
|
f(in_T, out_T, W_T, narrow, 9216) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 9472) \
|
||||||
f(in_T, out_T, W_T, narrow, 10240) \
|
f(in_T, out_T, W_T, narrow, 10240) \
|
||||||
f(in_T, out_T, W_T, narrow, 11008) \
|
f(in_T, out_T, W_T, narrow, 11008) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 11264) \
|
||||||
f(in_T, out_T, W_T, narrow, 12288) \
|
f(in_T, out_T, W_T, narrow, 12288) \
|
||||||
f(in_T, out_T, W_T, narrow, 13696) \
|
f(in_T, out_T, W_T, narrow, 13696) \
|
||||||
f(in_T, out_T, W_T, narrow, 13824) \
|
f(in_T, out_T, W_T, narrow, 13824) \
|
||||||
f(in_T, out_T, W_T, narrow, 14336) \
|
f(in_T, out_T, W_T, narrow, 14336) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 14784) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 14848) \
|
||||||
f(in_T, out_T, W_T, narrow, 15360) \
|
f(in_T, out_T, W_T, narrow, 15360) \
|
||||||
f(in_T, out_T, W_T, narrow, 16384) \
|
f(in_T, out_T, W_T, narrow, 16384) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 18944) \
|
||||||
f(in_T, out_T, W_T, narrow, 20480) \
|
f(in_T, out_T, W_T, narrow, 20480) \
|
||||||
f(in_T, out_T, W_T, narrow, 22016) \
|
f(in_T, out_T, W_T, narrow, 22016) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 22528) \
|
||||||
f(in_T, out_T, W_T, narrow, 24576) \
|
f(in_T, out_T, W_T, narrow, 24576) \
|
||||||
f(in_T, out_T, W_T, narrow, 27392) \
|
f(in_T, out_T, W_T, narrow, 27392) \
|
||||||
f(in_T, out_T, W_T, narrow, 27648) \
|
f(in_T, out_T, W_T, narrow, 27648) \
|
||||||
f(in_T, out_T, W_T, narrow, 28672) \
|
f(in_T, out_T, W_T, narrow, 28672) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 29568) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 29696) \
|
||||||
f(in_T, out_T, W_T, narrow, 32000) \
|
f(in_T, out_T, W_T, narrow, 32000) \
|
||||||
f(in_T, out_T, W_T, narrow, 32256) \
|
f(in_T, out_T, W_T, narrow, 32256) \
|
||||||
f(in_T, out_T, W_T, narrow, 32512) \
|
f(in_T, out_T, W_T, narrow, 32512) \
|
||||||
@@ -65,6 +86,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 36864) \
|
f(in_T, out_T, W_T, narrow, 36864) \
|
||||||
f(in_T, out_T, W_T, narrow, 43264) \
|
f(in_T, out_T, W_T, narrow, 43264) \
|
||||||
f(in_T, out_T, W_T, narrow, 49152) \
|
f(in_T, out_T, W_T, narrow, 49152) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 49408) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 60544) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 60672) \
|
||||||
f(in_T, out_T, W_T, narrow, 64000) \
|
f(in_T, out_T, W_T, narrow, 64000) \
|
||||||
f(in_T, out_T, W_T, narrow, 64256) \
|
f(in_T, out_T, W_T, narrow, 64256) \
|
||||||
f(in_T, out_T, W_T, narrow, 64512) \
|
f(in_T, out_T, W_T, narrow, 64512) \
|
||||||
@@ -74,12 +98,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 128000) \
|
f(in_T, out_T, W_T, narrow, 128000) \
|
||||||
f(in_T, out_T, W_T, narrow, 128256) \
|
f(in_T, out_T, W_T, narrow, 128256) \
|
||||||
f(in_T, out_T, W_T, narrow, 128512) \
|
f(in_T, out_T, W_T, narrow, 128512) \
|
||||||
|
|
||||||
|
|
||||||
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
|
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
|
||||||
// and vllm/tests/lora/test_punica.py
|
// and vllm/tests/lora/test_punica.py
|
||||||
|
|
||||||
// Used for defining kernels going from the variety of
|
// Used for defining kernels going from the variety of
|
||||||
// dim in to the narrow dim out
|
// dim in to the narrow dim out
|
||||||
// Using it for the fully sharded column
|
// Using it for the fully sharded column
|
||||||
// parallel LoRA A which splits the rank dim
|
// parallel LoRA A which splits the rank dim
|
||||||
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
|
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
|
||||||
f(in_T, out_T, W_T, 128, narrow) \
|
f(in_T, out_T, W_T, 128, narrow) \
|
||||||
@@ -87,14 +113,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 512, narrow) \
|
f(in_T, out_T, W_T, 512, narrow) \
|
||||||
f(in_T, out_T, W_T, 640, narrow) \
|
f(in_T, out_T, W_T, 640, narrow) \
|
||||||
f(in_T, out_T, W_T, 768, narrow) \
|
f(in_T, out_T, W_T, 768, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 896, narrow) \
|
||||||
f(in_T, out_T, W_T, 1024, narrow) \
|
f(in_T, out_T, W_T, 1024, narrow) \
|
||||||
f(in_T, out_T, W_T, 1152, narrow) \
|
f(in_T, out_T, W_T, 1152, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 1216, narrow) \
|
||||||
f(in_T, out_T, W_T, 1280, narrow) \
|
f(in_T, out_T, W_T, 1280, narrow) \
|
||||||
f(in_T, out_T, W_T, 1536, narrow) \
|
f(in_T, out_T, W_T, 1536, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 1664, narrow) \
|
||||||
f(in_T, out_T, W_T, 1728, narrow) \
|
f(in_T, out_T, W_T, 1728, narrow) \
|
||||||
f(in_T, out_T, W_T, 1792, narrow) \
|
f(in_T, out_T, W_T, 1792, narrow) \
|
||||||
f(in_T, out_T, W_T, 2048, narrow) \
|
f(in_T, out_T, W_T, 2048, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2240, narrow) \
|
||||||
f(in_T, out_T, W_T, 2304, narrow) \
|
f(in_T, out_T, W_T, 2304, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2368, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2432, narrow) \
|
||||||
f(in_T, out_T, W_T, 2560, narrow) \
|
f(in_T, out_T, W_T, 2560, narrow) \
|
||||||
f(in_T, out_T, W_T, 2752, narrow) \
|
f(in_T, out_T, W_T, 2752, narrow) \
|
||||||
f(in_T, out_T, W_T, 2816, narrow) \
|
f(in_T, out_T, W_T, 2816, narrow) \
|
||||||
@@ -102,32 +134,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 3328, narrow) \
|
f(in_T, out_T, W_T, 3328, narrow) \
|
||||||
f(in_T, out_T, W_T, 3456, narrow) \
|
f(in_T, out_T, W_T, 3456, narrow) \
|
||||||
f(in_T, out_T, W_T, 3584, narrow) \
|
f(in_T, out_T, W_T, 3584, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 3712, narrow) \
|
||||||
f(in_T, out_T, W_T, 4096, narrow) \
|
f(in_T, out_T, W_T, 4096, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4480, narrow) \
|
||||||
f(in_T, out_T, W_T, 4608, narrow) \
|
f(in_T, out_T, W_T, 4608, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4736, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4864, narrow) \
|
||||||
f(in_T, out_T, W_T, 5120, narrow) \
|
f(in_T, out_T, W_T, 5120, narrow) \
|
||||||
f(in_T, out_T, W_T, 5504, narrow) \
|
f(in_T, out_T, W_T, 5504, narrow) \
|
||||||
f(in_T, out_T, W_T, 5632, narrow) \
|
f(in_T, out_T, W_T, 5632, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 5888, narrow) \
|
||||||
f(in_T, out_T, W_T, 6144, narrow) \
|
f(in_T, out_T, W_T, 6144, narrow) \
|
||||||
f(in_T, out_T, W_T, 6400, narrow) \
|
f(in_T, out_T, W_T, 6400, narrow) \
|
||||||
f(in_T, out_T, W_T, 6848, narrow) \
|
f(in_T, out_T, W_T, 6848, narrow) \
|
||||||
f(in_T, out_T, W_T, 6912, narrow) \
|
f(in_T, out_T, W_T, 6912, narrow) \
|
||||||
f(in_T, out_T, W_T, 7168, narrow) \
|
f(in_T, out_T, W_T, 7168, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 7424, narrow) \
|
||||||
f(in_T, out_T, W_T, 8192, narrow) \
|
f(in_T, out_T, W_T, 8192, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 8960, narrow) \
|
||||||
f(in_T, out_T, W_T, 9216, narrow) \
|
f(in_T, out_T, W_T, 9216, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 9472, narrow) \
|
||||||
f(in_T, out_T, W_T, 10240, narrow) \
|
f(in_T, out_T, W_T, 10240, narrow) \
|
||||||
f(in_T, out_T, W_T, 11008, narrow) \
|
f(in_T, out_T, W_T, 11008, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 11264, narrow) \
|
||||||
f(in_T, out_T, W_T, 12288, narrow) \
|
f(in_T, out_T, W_T, 12288, narrow) \
|
||||||
f(in_T, out_T, W_T, 13696, narrow) \
|
f(in_T, out_T, W_T, 13696, narrow) \
|
||||||
f(in_T, out_T, W_T, 13824, narrow) \
|
f(in_T, out_T, W_T, 13824, narrow) \
|
||||||
f(in_T, out_T, W_T, 14336, narrow) \
|
f(in_T, out_T, W_T, 14336, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 14784, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 14848, narrow) \
|
||||||
f(in_T, out_T, W_T, 15360, narrow) \
|
f(in_T, out_T, W_T, 15360, narrow) \
|
||||||
f(in_T, out_T, W_T, 16384, narrow) \
|
f(in_T, out_T, W_T, 16384, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 18944, narrow) \
|
||||||
f(in_T, out_T, W_T, 20480, narrow) \
|
f(in_T, out_T, W_T, 20480, narrow) \
|
||||||
f(in_T, out_T, W_T, 22016, narrow) \
|
f(in_T, out_T, W_T, 22016, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 22528, narrow) \
|
||||||
f(in_T, out_T, W_T, 24576, narrow) \
|
f(in_T, out_T, W_T, 24576, narrow) \
|
||||||
f(in_T, out_T, W_T, 27392, narrow) \
|
f(in_T, out_T, W_T, 27392, narrow) \
|
||||||
f(in_T, out_T, W_T, 27648, narrow) \
|
f(in_T, out_T, W_T, 27648, narrow) \
|
||||||
f(in_T, out_T, W_T, 28672, narrow) \
|
f(in_T, out_T, W_T, 28672, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 29568, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 29696, narrow) \
|
||||||
f(in_T, out_T, W_T, 32000, narrow) \
|
f(in_T, out_T, W_T, 32000, narrow) \
|
||||||
f(in_T, out_T, W_T, 32256, narrow) \
|
f(in_T, out_T, W_T, 32256, narrow) \
|
||||||
f(in_T, out_T, W_T, 32512, narrow) \
|
f(in_T, out_T, W_T, 32512, narrow) \
|
||||||
@@ -136,6 +183,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 36864, narrow) \
|
f(in_T, out_T, W_T, 36864, narrow) \
|
||||||
f(in_T, out_T, W_T, 43264, narrow) \
|
f(in_T, out_T, W_T, 43264, narrow) \
|
||||||
f(in_T, out_T, W_T, 49152, narrow) \
|
f(in_T, out_T, W_T, 49152, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 49408, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 60544, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 60672, narrow) \
|
||||||
f(in_T, out_T, W_T, 64000, narrow) \
|
f(in_T, out_T, W_T, 64000, narrow) \
|
||||||
f(in_T, out_T, W_T, 64256, narrow) \
|
f(in_T, out_T, W_T, 64256, narrow) \
|
||||||
f(in_T, out_T, W_T, 64512, narrow) \
|
f(in_T, out_T, W_T, 64512, narrow) \
|
||||||
|
|||||||
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {
|
|||||||
|
|
||||||
CUTLASS_DEVICE void
|
CUTLASS_DEVICE void
|
||||||
begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
|
begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
|
||||||
if (params.ptr_row == nullptr) {
|
if (!params.row_broadcast) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "cutlass/cutlass.h"
|
#include "cutlass/cutlass.h"
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper function for checking CUTLASS errors
|
* Helper function for checking CUTLASS errors
|
||||||
@@ -10,3 +11,17 @@
|
|||||||
TORCH_CHECK(status == cutlass::Status::kSuccess, \
|
TORCH_CHECK(status == cutlass::Status::kSuccess, \
|
||||||
cutlassGetStatusString(status)) \
|
cutlassGetStatusString(status)) \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Rounds `num` up to the nearest power of two.
 *
 * Inputs 0 and 1 are returned unchanged, and a value that already is a power
 * of two maps to itself. If the result would not fit in 32 bits
 * (num > 2^31), 0 is returned instead of performing an out-of-range shift.
 */
inline uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;

  constexpr uint32_t kLargestPow2 = uint32_t{1} << 31;
  // A shift count of 32 on a 32-bit operand is undefined; report overflow.
  if (num > kLargestPow2) return 0;

  constexpr int kBits = CHAR_BIT * sizeof(uint32_t);
  // The bit width of (num - 1) is the exponent of the next power of two.
  // The shift is done on an unsigned 1 so that bit 31 can be set safely.
  return uint32_t{1} << (kBits - __builtin_clz(num - 1));
}
|
||||||
|
|
||||||
|
/**
 * Queries cudaDevAttrMaxSharedMemoryPerBlockOptin for `device` and returns
 * the reported value.
 *
 * The status returned by cudaDeviceGetAttribute is not inspected; the result
 * starts at 0, so 0 is returned if the call leaves it untouched.
 */
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
  int opt_in_limit = 0;
  cudaDeviceGetAttribute(&opt_in_limit,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
  return opt_in_limit;
}
|
||||||
|
|
||||||
|
|||||||
@@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
This epilogue function defines a quantized GEMM operation similar to
|
* This class provides the common ScaleA and ScaleB descriptors for the
|
||||||
torch._scaled_mm.
|
* ScaledEpilogue and ScaledEpilogueBias classes.
|
||||||
|
*/
|
||||||
A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
|
|
||||||
per-row. B can be quantized per-tensor or per-column.
|
|
||||||
Any combination of per-tensor and per-row or column is supported.
|
|
||||||
A and B must have symmetric quantization (zero point == 0).
|
|
||||||
|
|
||||||
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
|
|
||||||
scales are applied elementwise with numpy-style broadcasting.
|
|
||||||
|
|
||||||
ScaleA and ScaleB define the epilogue functions that apply the scales for
|
|
||||||
the A and B operands respectively. These scales may be either per-tensor or
|
|
||||||
per row or column.
|
|
||||||
*/
|
|
||||||
template <typename ElementD, typename OutputTileThreadMap>
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
struct ScaledEpilogue {
|
struct ScaledEpilogueBase {
|
||||||
private:
|
protected:
|
||||||
using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
|
using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
|
||||||
|
|
||||||
using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
|
using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
|
||||||
@@ -102,6 +90,32 @@ struct ScaledEpilogue {
|
|||||||
|
|
||||||
using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
|
using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
|
||||||
OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
|
OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
This epilogue function defines a quantized GEMM operation similar to
|
||||||
|
torch._scaled_mm.
|
||||||
|
|
||||||
|
A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
|
||||||
|
per-row. B can be quantized per-tensor or per-column.
|
||||||
|
Any combination of per-tensor and per-row or column is supported.
|
||||||
|
A and B must have symmetric quantization (zero point == 0).
|
||||||
|
|
||||||
|
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
|
||||||
|
scales are applied elementwise with numpy-style broadcasting.
|
||||||
|
|
||||||
|
ScaleA and ScaleB define the epilogue functions that apply the scales for
|
||||||
|
the A and B operands respectively. These scales may be either per-tensor or
|
||||||
|
per row or column.
|
||||||
|
*/
|
||||||
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
|
struct ScaledEpilogue
|
||||||
|
: private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
|
||||||
|
private:
|
||||||
|
using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
|
|
||||||
using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
|
using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
cutlass::multiplies, float, float,
|
cutlass::multiplies, float, float,
|
||||||
@@ -134,6 +148,53 @@ struct ScaledEpilogue {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
|
struct ScaledEpilogueBias
|
||||||
|
: private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
|
||||||
|
private:
|
||||||
|
using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
|
|
||||||
|
using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiplies, float, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using EVTCompute0 =
|
||||||
|
cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
|
||||||
|
|
||||||
|
using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiply_add, ElementD, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast<
|
||||||
|
OutputTileThreadMap, ElementD, Stride<Int<0>, Int<1>, Int<0>>>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
|
||||||
|
EVTCompute0, Bias>;
|
||||||
|
using ArgumentType = typename EVTCompute::Arguments;
|
||||||
|
|
||||||
|
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
torch::Tensor const& bias) {
|
||||||
|
using ScaleAArgs = typename ScaleA::Arguments;
|
||||||
|
using ScaleBArgs = typename ScaleB::Arguments;
|
||||||
|
using BiasArgs = typename Bias::Arguments;
|
||||||
|
|
||||||
|
ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
|
||||||
|
ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
|
||||||
|
BiasArgs bias_args{static_cast<ElementD*>(bias.data_ptr()), {}};
|
||||||
|
|
||||||
|
typename EVTCompute0::Arguments evt0_compute_args{b_args};
|
||||||
|
|
||||||
|
typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args,
|
||||||
|
bias_args};
|
||||||
|
return evt_compute_args;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename Arch, template <typename> typename ArchGuard,
|
template <typename Arch, template <typename> typename ArchGuard,
|
||||||
typename ElementAB_, typename ElementD_,
|
typename ElementAB_, typename ElementD_,
|
||||||
template <typename, typename> typename Epilogue_, typename TileShape,
|
template <typename, typename> typename Epilogue_, typename TileShape,
|
||||||
@@ -168,13 +229,13 @@ struct cutlass_2x_gemm {
|
|||||||
// clang-format off
|
// clang-format off
|
||||||
using RowMajor = typename cutlass::layout::RowMajor;
|
using RowMajor = typename cutlass::layout::RowMajor;
|
||||||
using ColumnMajor = typename cutlass::layout::ColumnMajor;
|
using ColumnMajor = typename cutlass::layout::ColumnMajor;
|
||||||
using KernelType =
|
using KernelType =
|
||||||
ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
|
ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
|
||||||
ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
|
ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
|
||||||
ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
|
ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
|
||||||
float, cutlass::layout::RowMajor, 4,
|
float, cutlass::layout::RowMajor, 4,
|
||||||
ElementAcc, float, cutlass::arch::OpClassTensorOp,
|
ElementAcc, float, cutlass::arch::OpClassTensorOp,
|
||||||
Arch,
|
Arch,
|
||||||
TileShape, WarpShape, InstructionShape,
|
TileShape, WarpShape, InstructionShape,
|
||||||
EVTD,
|
EVTD,
|
||||||
cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
|
cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
|
||||||
@@ -250,16 +311,167 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
CUTLASS_CHECK(status);
|
CUTLASS_CHECK(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
|
||||||
|
void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... args) {
|
||||||
|
// In some cases, the GPU isn't able to accommodate the
|
||||||
|
// shared memory requirements of the Gemm. In such cases, use
|
||||||
|
// the FallbackGemm instead.
|
||||||
|
static const int max_shared_mem_per_block_opt_in =
|
||||||
|
get_cuda_max_shared_memory_per_block_opt_in(0);
|
||||||
|
|
||||||
|
size_t const gemm_shared_mem_size =
|
||||||
|
sizeof(typename Gemm::KernelType::SharedStorage);
|
||||||
|
size_t const fallback_gemm_shared_mem_size =
|
||||||
|
sizeof(typename FallbackGemm::KernelType::SharedStorage);
|
||||||
|
|
||||||
|
if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
|
||||||
|
return cutlass_gemm_caller<Gemm>(out, a, b,
|
||||||
|
std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(fallback_gemm_shared_mem_size <=
|
||||||
|
max_shared_mem_per_block_opt_in);
|
||||||
|
return cutlass_gemm_caller<FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_default {
|
||||||
|
// This config is used in 2 cases,
|
||||||
|
// - M in (128, inf)
|
||||||
|
// - M in (64, 128] and N >= 8192
|
||||||
|
// Shared Memory required by this Gemm - 81920 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M64 {
|
||||||
|
// This config is used in 2 cases,
|
||||||
|
// - M in (32, 64]
|
||||||
|
// - M in (64, 128] and N < 8192
|
||||||
|
// Shared Memory required by this Gemm - 122880 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M32 {
|
||||||
|
// M in (16, 32]
|
||||||
|
// Shared Memory required by this Gemm - 61440 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M16 {
|
||||||
|
// M in [1, 16]
|
||||||
|
// Shared Memory required by this Gemm - 51200 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
|
template <typename InType, typename OutType,
|
||||||
torch::Tensor const& b,
|
template <typename, typename> typename Epilogue,
|
||||||
torch::Tensor const& a_scales,
|
typename... EpilogueArgs>
|
||||||
torch::Tensor const& b_scales) {
|
void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... args) {
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
using Cutlass2xGemmDefault =
|
||||||
|
typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM128BigN =
|
||||||
|
typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM128SmallN =
|
||||||
|
typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM64 =
|
||||||
|
typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM32 =
|
||||||
|
typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM16 =
|
||||||
|
typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
|
||||||
|
// Due to shared memory requirements, some Gemms may fail to run on some
|
||||||
|
// GPUs. As the name indicates, the Fallback Gemm is used as an alternative
|
||||||
|
// in such cases.
|
||||||
|
// sm80_config_M16 has the least shared-memory requirement. However,
|
||||||
|
// based on some profiling, we select sm80_config_M32 as a better alternative
|
||||||
|
// performance wise.
|
||||||
|
using FallbackGemm =
|
||||||
|
typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
|
||||||
|
uint32_t const m = a.size(0);
|
||||||
|
uint32_t const mp2 =
|
||||||
|
std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
|
||||||
|
if (mp2 <= 16) {
|
||||||
|
// M in [1, 16]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 32) {
|
||||||
|
// M in (16, 32]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 64) {
|
||||||
|
// M in (32, 64]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 128) {
|
||||||
|
// M in (64, 128]
|
||||||
|
uint32_t const n = out.size(1);
|
||||||
|
bool const small_n = n < 8192;
|
||||||
|
if (small_n) {
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
|
||||||
|
FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// M in (128, inf)
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
TORCH_CHECK(a.dtype() == torch::kInt8);
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
TORCH_CHECK(b.dtype() == torch::kInt8);
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
|
||||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
|
||||||
|
|
||||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
@@ -268,85 +480,130 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
|
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
|
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias) {
|
||||||
|
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||||
|
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||||
|
if (bias) {
|
||||||
|
TORCH_CHECK(bias->dtype() == out.dtype(),
|
||||||
|
"currently bias dtype must match output dtype ", out.dtype());
|
||||||
|
return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogueBias>(
|
||||||
|
out, a, b, a_scales, b_scales, *bias);
|
||||||
|
} else {
|
||||||
|
return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogue>(out, a, b, a_scales,
|
||||||
|
b_scales);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
|
return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b_scales,
|
||||||
TORCH_CHECK(a.dtype() == torch::kInt8);
|
c10::optional<torch::Tensor> const& bias) {
|
||||||
TORCH_CHECK(b.dtype() == torch::kInt8);
|
|
||||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||||
|
if (bias) {
|
||||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
TORCH_CHECK(bias->dtype() == out.dtype(),
|
||||||
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
"currently bias dtype must match output dtype ", out.dtype());
|
||||||
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogueBias>(
|
||||||
|
out, a, b, a_scales, b_scales, *bias);
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
|
||||||
cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
|
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
|
||||||
out, a, b, a_scales, b_scales);
|
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogue>(out, a, b, a_scales,
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
b_scales);
|
||||||
cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
|
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
|
||||||
out, a, b, a_scales, b_scales);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
|
template <template <typename, typename> typename Epilogue,
|
||||||
torch::Tensor const& b,
|
typename... EpilogueArgs>
|
||||||
torch::Tensor const& a_scales,
|
void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
|
||||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
|
||||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
|
||||||
|
|
||||||
if (a.dtype() == torch::kInt8) {
|
if (a.dtype() == torch::kInt8) {
|
||||||
TORCH_CHECK(b.dtype() == torch::kInt8);
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
|
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
assert(out.dtype() == torch::kFloat16);
|
assert(out.dtype() == torch::kFloat16);
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
|
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
|
||||||
ScaledEpilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
||||||
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<
|
||||||
cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
|
cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||||
cutlass::bfloat16_t, ScaledEpilogue, TileShape, WarpShape,
|
cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue,
|
||||||
InstructionShape, 5>>(out, a, b, a_scales, b_scales);
|
TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
return cutlass_gemm_caller<cutlass_2x_gemm<
|
return cutlass_gemm_caller<
|
||||||
cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
|
cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||||
cutlass::half_t, ScaledEpilogue, TileShape, WarpShape,
|
cutlass::float_e4m3_t, cutlass::half_t, Epilogue,
|
||||||
InstructionShape, 5>>(out, a, b, a_scales, b_scales);
|
TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias) {
|
||||||
|
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||||
|
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||||
|
if (bias) {
|
||||||
|
TORCH_CHECK(bias->dtype() == out.dtype(),
|
||||||
|
"currently bias dtype must match output dtype ", out.dtype());
|
||||||
|
return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogueBias>(
|
||||||
|
out, a, b, a_scales, b_scales, *bias);
|
||||||
|
} else {
|
||||||
|
return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogue>(out, a, b, a_scales,
|
||||||
|
b_scales);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -44,11 +44,6 @@ using namespace cute;
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
uint32_t next_pow_2(uint32_t const num) {
|
|
||||||
if (num <= 1) return num;
|
|
||||||
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// A wrapper for the GEMM kernel that is used to guard against compilation on
|
// A wrapper for the GEMM kernel that is used to guard against compilation on
|
||||||
// architectures that will never use the kernel. The purpose of this is to
|
// architectures that will never use the kernel. The purpose of this is to
|
||||||
// reduce the size of the compiled binary.
|
// reduce the size of the compiled binary.
|
||||||
@@ -64,6 +59,28 @@ struct enable_sm90_or_later : Kernel {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This class provides the common ScaleA and ScaleB descriptors for the
|
||||||
|
* ScaledEpilogue and ScaledEpilogueBias classes.
|
||||||
|
*/
|
||||||
|
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
|
||||||
|
struct ScaledEpilogueBase {
|
||||||
|
protected:
|
||||||
|
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
|
||||||
|
|
||||||
|
using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
|
||||||
|
0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
|
||||||
|
Stride<Int<1>, Int<0>, Int<0>>>;
|
||||||
|
|
||||||
|
using ScaleBDescriptor =
|
||||||
|
cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
|
||||||
|
EpilogueDescriptor, float>;
|
||||||
|
|
||||||
|
using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
|
||||||
|
ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
|
||||||
|
typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
This epilogue function defines a quantized GEMM operation similar to
|
This epilogue function defines a quantized GEMM operation similar to
|
||||||
torch.scaled_mm_.
|
torch.scaled_mm_.
|
||||||
@@ -81,21 +98,13 @@ struct enable_sm90_or_later : Kernel {
|
|||||||
per row or column.
|
per row or column.
|
||||||
*/
|
*/
|
||||||
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
|
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
|
||||||
struct ScaledEpilogue {
|
struct ScaledEpilogue
|
||||||
|
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
|
||||||
private:
|
private:
|
||||||
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
|
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
Stride<Int<1>, Int<0>, Int<0>>>;
|
|
||||||
|
|
||||||
using ScaleBDescriptor =
|
|
||||||
cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
|
|
||||||
EpilogueDescriptor, float>;
|
|
||||||
|
|
||||||
using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
|
|
||||||
ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
|
|
||||||
typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
|
|
||||||
|
|
||||||
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
|
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
|
||||||
cutlass::multiplies, float, float,
|
cutlass::multiplies, float, float,
|
||||||
@@ -125,6 +134,54 @@ struct ScaledEpilogue {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
|
||||||
|
struct ScaledEpilogueBias
|
||||||
|
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
|
||||||
|
private:
|
||||||
|
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
|
|
||||||
|
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
|
||||||
|
cutlass::multiplies, float, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using EVTCompute0 =
|
||||||
|
cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
|
||||||
|
|
||||||
|
using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
|
||||||
|
cutlass::multiply_add, ElementD, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using BiasDescriptor =
|
||||||
|
cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
|
||||||
|
EpilogueDescriptor, ElementD>;
|
||||||
|
|
||||||
|
using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
|
||||||
|
BiasDescriptor::Stages, typename EpilogueDescriptor::TileShape, ElementD,
|
||||||
|
Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<ElementD>, false>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using EVTCompute =
|
||||||
|
cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
|
||||||
|
using ArgumentType = typename EVTCompute::Arguments;
|
||||||
|
|
||||||
|
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
torch::Tensor const& bias) {
|
||||||
|
using ScaleA_Args = typename ScaleA::Arguments;
|
||||||
|
using ScaleB_Args = typename ScaleB::Arguments;
|
||||||
|
using Bias_Args = typename Bias::Arguments;
|
||||||
|
|
||||||
|
ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
|
||||||
|
ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
|
||||||
|
Bias_Args bias_args{static_cast<ElementD*>(bias.data_ptr())};
|
||||||
|
|
||||||
|
return ArgumentType{a_args, {b_args}, bias_args};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template <typename ElementAB_, typename ElementD_,
|
template <typename ElementAB_, typename ElementD_,
|
||||||
template <typename, typename, typename> typename Epilogue_,
|
template <typename, typename, typename> typename Epilogue_,
|
||||||
typename TileShape, typename ClusterShape, typename KernelSchedule,
|
typename TileShape, typename ClusterShape, typename KernelSchedule,
|
||||||
@@ -234,15 +291,15 @@ void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue, int32_t M>
|
template <typename, typename, typename> typename Epilogue>
|
||||||
struct sm90_fp8_config {
|
struct sm90_fp8_config_default {
|
||||||
|
// M in (128, inf)
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule =
|
using KernelSchedule =
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
using TileShape = Shape<_128, _128, _128>;
|
using TileShape = Shape<_128, _128, _128>;
|
||||||
using ClusterShape = Shape<_2, _1, _1>;
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
KernelSchedule, EpilogueSchedule>;
|
||||||
@@ -250,14 +307,14 @@ struct sm90_fp8_config {
|
|||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue>
|
template <typename, typename, typename> typename Epilogue>
|
||||||
struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
|
struct sm90_fp8_config_M128 {
|
||||||
|
// M in (64, 128]
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule =
|
using KernelSchedule =
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
using TileShape = Shape<_64, _128, _128>;
|
using TileShape = Shape<_64, _128, _128>;
|
||||||
using ClusterShape = Shape<_2, _1, _1>;
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
|
|
||||||
using Cutlass3xGemm =
|
using Cutlass3xGemm =
|
||||||
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
KernelSchedule, EpilogueSchedule>;
|
KernelSchedule, EpilogueSchedule>;
|
||||||
@@ -265,7 +322,8 @@ struct sm90_fp8_config<InType, OutType, Epilogue, 128> {
|
|||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue>
|
template <typename, typename, typename> typename Epilogue>
|
||||||
struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
|
struct sm90_fp8_config_M64 {
|
||||||
|
// M in [1, 64]
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule =
|
using KernelSchedule =
|
||||||
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
|
||||||
@@ -278,6 +336,78 @@ struct sm90_fp8_config<InType, OutType, Epilogue, 64> {
|
|||||||
KernelSchedule, EpilogueSchedule>;
|
KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm90_int8_config_default {
|
||||||
|
// For M > 128 and any N
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using KernelSchedule =
|
||||||
|
typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_128, _128, _128>;
|
||||||
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm90_int8_config_M128 {
|
||||||
|
// For M in (64, 128] and any N
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using KernelSchedule =
|
||||||
|
typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _128, _128>;
|
||||||
|
using ClusterShape = Shape<_2, _1, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm90_int8_config_M64 {
|
||||||
|
// For M in (32, 64] and any N
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _64, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _1, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm90_int8_config_M32_NBig {
|
||||||
|
// For M in [1, 32] and N >= 8192
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _128, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _4, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm90_int8_config_M32_NSmall {
|
||||||
|
// For M in [1, 32] and N < 8192
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
|
||||||
|
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
||||||
|
using TileShape = Shape<_64, _64, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _8, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
@@ -291,11 +421,12 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
|
||||||
using Cutlass3xGemmDefault =
|
using Cutlass3xGemmDefault =
|
||||||
typename sm90_fp8_config<InType, OutType, Epilogue, 0>::Cutlass3xGemm;
|
typename sm90_fp8_config_default<InType, OutType,
|
||||||
|
Epilogue>::Cutlass3xGemm;
|
||||||
using Cutlass3xGemmM64 =
|
using Cutlass3xGemmM64 =
|
||||||
typename sm90_fp8_config<InType, OutType, Epilogue, 64>::Cutlass3xGemm;
|
typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
using Cutlass3xGemmM128 =
|
using Cutlass3xGemmM128 =
|
||||||
typename sm90_fp8_config<InType, OutType, Epilogue, 128>::Cutlass3xGemm;
|
typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
|
||||||
uint32_t const m = a.size(0);
|
uint32_t const m = a.size(0);
|
||||||
uint32_t const mp2 =
|
uint32_t const mp2 =
|
||||||
@@ -316,49 +447,111 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
|
template <typename InType, typename OutType,
|
||||||
torch::Tensor const& b,
|
template <typename, typename, typename> typename Epilogue,
|
||||||
torch::Tensor const& a_scales,
|
typename... EpilogueArgs>
|
||||||
torch::Tensor const& b_scales) {
|
void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
|
||||||
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
torch::Tensor const& b,
|
||||||
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
EpilogueArgs&&... args) {
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
using Cutlass3xGemmDefault =
|
||||||
|
typename sm90_int8_config_default<InType, OutType,
|
||||||
|
Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM128 =
|
||||||
|
typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM64 =
|
||||||
|
typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM32NBig =
|
||||||
|
typename sm90_int8_config_M32_NBig<InType, OutType,
|
||||||
|
Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM32NSmall =
|
||||||
|
typename sm90_int8_config_M32_NSmall<InType, OutType,
|
||||||
|
Epilogue>::Cutlass3xGemm;
|
||||||
|
|
||||||
|
uint32_t const n = out.size(1);
|
||||||
|
bool const is_small_n = n < 8192;
|
||||||
|
|
||||||
|
uint32_t const m = a.size(0);
|
||||||
|
uint32_t const mp2 =
|
||||||
|
std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
|
||||||
|
|
||||||
|
if (mp2 <= 32) {
|
||||||
|
// m in [1, 32]
|
||||||
|
if (is_small_n) {
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
} else if (mp2 <= 64) {
|
||||||
|
// m in (32, 64]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM64>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 128) {
|
||||||
|
// m in (64, 128]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM128>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
// m in (128, inf)
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <template <typename, typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
if (a.dtype() == torch::kInt8) {
|
if (a.dtype() == torch::kInt8) {
|
||||||
TORCH_CHECK(b.dtype() == torch::kInt8);
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
using TileShape = Shape<_128, _128, _128>;
|
|
||||||
using ClusterShape = Shape<_1, _2, _1>;
|
|
||||||
using KernelSchedule =
|
|
||||||
typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
|
|
||||||
using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
|
|
||||||
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_caller<cutlass_3x_gemm<
|
return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
|
||||||
int8_t, cutlass::bfloat16_t, ScaledEpilogue, TileShape, ClusterShape,
|
Epilogue>(
|
||||||
KernelSchedule, EpilogueSchedule>>(out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
|
||||||
return cutlass_gemm_caller<
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
cutlass_3x_gemm<int8_t, cutlass::half_t, ScaledEpilogue, TileShape,
|
|
||||||
ClusterShape, KernelSchedule, EpilogueSchedule>>(
|
|
||||||
out, a, b, a_scales, b_scales);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
||||||
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
|
||||||
if (out.dtype() == torch::kBFloat16) {
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
return cutlass_gemm_sm90_fp8_dispatch<
|
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
||||||
cutlass::float_e4m3_t, cutlass::bfloat16_t, ScaledEpilogue>(
|
cutlass::bfloat16_t, Epilogue>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
|
||||||
cutlass::half_t, ScaledEpilogue>(
|
cutlass::half_t, Epilogue>(
|
||||||
out, a, b, a_scales, b_scales);
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias) {
|
||||||
|
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
|
||||||
|
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
|
||||||
|
if (bias) {
|
||||||
|
TORCH_CHECK(bias->dtype() == c.dtype(),
|
||||||
|
"currently bias dtype must match output dtype ", c.dtype());
|
||||||
|
return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogueBias>(
|
||||||
|
c, a, b, a_scales, b_scales, *bias);
|
||||||
|
} else {
|
||||||
|
return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogue>(c, a, b, a_scales,
|
||||||
|
b_scales);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -6,28 +6,55 @@
|
|||||||
void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales);
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales);
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales);
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
||||||
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b,
|
torch::Tensor const& b,
|
||||||
torch::Tensor const& a_scales,
|
torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales);
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
|
||||||
|
// CUTLASS FP8 kernels need at least
|
||||||
|
// CUDA 12.0 on SM90 systems (Hopper)
|
||||||
|
// CUDA 12.4 on SM89 systems (Lovelace)
|
||||||
|
|
||||||
|
#if defined CUDA_VERSION
|
||||||
|
if (cuda_device_capability >= 90) {
|
||||||
|
return CUDA_VERSION >= 12000;
|
||||||
|
} else if (cuda_device_capability >= 89) {
|
||||||
|
// CUTLASS Kernels have not been tuned for Ada Lovelace systems
|
||||||
|
// and are slower than torch.mm. Return false unconditionally in this case.
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Once the CUTLASS kernels have been optimized for Lovelace systems,
|
||||||
|
// use the following check:
|
||||||
|
// return CUDA_VERSION >= 12040;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
||||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||||
torch::Tensor const& b_scales) {
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias) {
|
||||||
int32_t major_capability;
|
int32_t major_capability;
|
||||||
int32_t minor_capability;
|
int32_t minor_capability;
|
||||||
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
|
cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor,
|
||||||
@@ -50,6 +77,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
b.stride(1) % 16 == 0); // 16 Byte Alignment
|
b.stride(1) % 16 == 0); // 16 Byte Alignment
|
||||||
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
|
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
|
||||||
|
|
||||||
|
if (bias) {
|
||||||
|
TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
|
||||||
|
bias->dim() == 1);
|
||||||
|
}
|
||||||
|
|
||||||
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
at::cuda::OptionalCUDAGuard const device_guard(device_of(a));
|
||||||
|
|
||||||
if (version_num >= 90) {
|
if (version_num >= 90) {
|
||||||
@@ -57,19 +89,19 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
|
|||||||
|
|
||||||
// Guard against compilation issues for sm90 kernels
|
// Guard against compilation issues for sm90 kernels
|
||||||
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
#if defined CUDA_VERSION && CUDA_VERSION >= 12000
|
||||||
cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales);
|
cutlass_scaled_mm_sm90(c, a, b, a_scales, b_scales, bias);
|
||||||
#else
|
#else
|
||||||
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
|
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
|
||||||
#endif
|
#endif
|
||||||
} else if (version_num == 89) {
|
} else if (version_num == 89) {
|
||||||
// Ada Lovelace
|
// Ada Lovelace
|
||||||
cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales);
|
cutlass_scaled_mm_sm89(c, a, b, a_scales, b_scales, bias);
|
||||||
} else if (version_num >= 80) {
|
} else if (version_num >= 80) {
|
||||||
// Ampere
|
// Ampere
|
||||||
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales);
|
cutlass_scaled_mm_sm80(c, a, b, a_scales, b_scales, bias);
|
||||||
} else {
|
} else {
|
||||||
// Turing
|
// Turing
|
||||||
TORCH_CHECK(version_num >= 75);
|
TORCH_CHECK(version_num >= 75);
|
||||||
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales);
|
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
1308
csrc/quantization/fp8/fp8_marlin.cu
Normal file
1308
csrc/quantization/fp8/fp8_marlin.cu
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,9 +17,23 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "base.h"
|
#include "base.h"
|
||||||
|
#include <cudaTypedefs.h>
|
||||||
|
|
||||||
namespace marlin_24 {
|
namespace marlin_24 {
|
||||||
|
|
||||||
|
// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
|
||||||
|
// is not supported. On later versions of CUDA the version without ordered
|
||||||
|
// metadata results in the following warning:
|
||||||
|
// | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
|
||||||
|
// | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
|
||||||
|
// | reduced performance on some future architectures
|
||||||
|
#if defined CUDA_VERSION && CUDA_VERSION >= 12050
|
||||||
|
#define MMA_SP_INST \
|
||||||
|
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
||||||
|
#else
|
||||||
|
#define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
||||||
|
#endif
|
||||||
|
|
||||||
// m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
|
// m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
|
||||||
// output/accumulation.
|
// output/accumulation.
|
||||||
__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
||||||
@@ -29,41 +43,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
|
|||||||
const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
|
const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
|
||||||
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
|
||||||
const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
|
const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
|
||||||
|
|
||||||
float* c = reinterpret_cast<float*>(&frag_c);
|
float* c = reinterpret_cast<float*>(&frag_c);
|
||||||
if (psel == 0) {
|
if (psel == 0) {
|
||||||
asm volatile(
|
asm volatile(MMA_SP_INST
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
||||||
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
|
||||||
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
|
"r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
|
||||||
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
"f"(c[2]), "f"(c[3]), "r"(e[0]));
|
||||||
"r"(e[0]));
|
asm volatile(MMA_SP_INST
|
||||||
asm volatile(
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
||||||
"{%12,%13,%14,%15}, %16, 0x0;\n"
|
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
|
||||||
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
|
||||||
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
|
"f"(c[6]), "f"(c[7]), "r"(e[0]));
|
||||||
"r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
|
|
||||||
"r"(e[0]));
|
|
||||||
} else {
|
} else {
|
||||||
asm volatile(
|
asm volatile(MMA_SP_INST
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
||||||
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
||||||
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
|
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
|
||||||
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
|
"r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
|
||||||
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
|
"f"(c[2]), "f"(c[3]), "r"(e[0]));
|
||||||
"r"(e[0]));
|
asm volatile(MMA_SP_INST
|
||||||
asm volatile(
|
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
||||||
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
|
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
||||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
|
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
||||||
"{%12,%13,%14,%15}, %16, 0x1;\n"
|
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
|
||||||
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
|
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
|
||||||
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
|
"f"(c[6]), "f"(c[7]), "r"(e[0]));
|
||||||
"r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
|
|
||||||
"r"(e[0]));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
|
ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
|
||||||
ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
|
ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
|
||||||
|
|
||||||
|
// Quick GELU implementation.
|
||||||
|
ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
|
||||||
|
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
|
||||||
|
|
||||||
// Layernorm
|
// Layernorm
|
||||||
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
// Apply Root Mean Square (RMS) Normalization to the input tensor.
|
||||||
ops.def(
|
ops.def(
|
||||||
@@ -133,13 +137,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
|||||||
ops.def("gptq_marlin_repack", &gptq_marlin_repack);
|
ops.def("gptq_marlin_repack", &gptq_marlin_repack);
|
||||||
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
|
||||||
|
|
||||||
|
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
|
||||||
|
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
|
||||||
|
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
|
||||||
|
|
||||||
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||||
// quantization.
|
// quantization.
|
||||||
ops.def(
|
ops.def(
|
||||||
"cutlass_scaled_mm(Tensor! out, Tensor a,"
|
"cutlass_scaled_mm(Tensor! out, Tensor a,"
|
||||||
" Tensor b, Tensor a_scales,"
|
" Tensor b, Tensor a_scales,"
|
||||||
" Tensor b_scales) -> ()");
|
" Tensor b_scales, Tensor? bias) -> ()");
|
||||||
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
|
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
|
||||||
|
|
||||||
|
// Check if cutlass scaled_mm is supported for CUDA devices of the given
|
||||||
|
// capability
|
||||||
|
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
|
||||||
|
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
|
||||||
|
&cutlass_scaled_mm_supports_fp8);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Quantized GEMM for GPTQ.
|
// Quantized GEMM for GPTQ.
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user