Compare commits
380 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
79d406e918 | ||
|
|
abad5746a7 | ||
|
|
e58294ddf2 | ||
|
|
f1e15da6fe | ||
|
|
0097bb1829 | ||
|
|
ea4b570483 | ||
|
|
a41357e941 | ||
|
|
ae96ef8fbd | ||
|
|
69ec3ca14c | ||
|
|
81d7a50f24 | ||
|
|
27902d42be | ||
|
|
56b325e977 | ||
|
|
3dd507083f | ||
|
|
0ed646b7aa | ||
|
|
1dab9bc8a9 | ||
|
|
3de6e6a30e | ||
|
|
966fe72141 | ||
|
|
62963d129e | ||
|
|
d9e98f42e4 | ||
|
|
3c6325f0fc | ||
|
|
47f0954af0 | ||
|
|
7cd2ebb025 | ||
|
|
f1c78138aa | ||
|
|
3a86b54fb0 | ||
|
|
f666207161 | ||
|
|
d830656a97 | ||
|
|
d18bab3587 | ||
|
|
9831aec49f | ||
|
|
482045ee77 | ||
|
|
9d6a8daa87 | ||
|
|
ee93f4f92a | ||
|
|
7c008c51a9 | ||
|
|
4d26d806e1 | ||
|
|
c5832d2ae9 | ||
|
|
15aba081f3 | ||
|
|
31354e563f | ||
|
|
98d6682cd1 | ||
|
|
2c37540aa6 | ||
|
|
3476ed0809 | ||
|
|
54600709b6 | ||
|
|
e373853e12 | ||
|
|
c87ebc3ef9 | ||
|
|
c4059ea54f | ||
|
|
8e0817c262 | ||
|
|
83bdcb6ac3 | ||
|
|
12a59959ed | ||
|
|
dec6fc6f3b | ||
|
|
8893130b63 | ||
|
|
bb60326836 | ||
|
|
4050d646e5 | ||
|
|
d76084c12f | ||
|
|
80ca1e6a3a | ||
|
|
614aa51203 | ||
|
|
af9ad46fca | ||
|
|
7836fdcc11 | ||
|
|
deacb7ec44 | ||
|
|
f5e73c9f1b | ||
|
|
c6c240aa0a | ||
|
|
2be6955a3f | ||
|
|
9d47f64eb6 | ||
|
|
cff6a1fec1 | ||
|
|
bcc6a09b63 | ||
|
|
9def10664e | ||
|
|
75aa1442db | ||
|
|
99397da534 | ||
|
|
8dbfcd35bf | ||
|
|
f7dac83d95 | ||
|
|
7c01f70641 | ||
|
|
51e971d39e | ||
|
|
329df38f1a | ||
|
|
580353da93 | ||
|
|
ba4994443a | ||
|
|
906a19cdb0 | ||
|
|
c4bca740e8 | ||
|
|
7f83f40dee | ||
|
|
54814fd85b | ||
|
|
7041de4384 | ||
|
|
6a62cb82cc | ||
|
|
5d2a1a9cf0 | ||
|
|
4bf35ed9ae | ||
|
|
be0b3af9e0 | ||
|
|
2cd402e169 | ||
|
|
b185230744 | ||
|
|
6a2d659d28 | ||
|
|
b2c620230a | ||
|
|
b90d8cd832 | ||
|
|
3b752a6555 | ||
|
|
ec1ad0046c | ||
|
|
57f09a419c | ||
|
|
5932634409 | ||
|
|
5cbe8d155c | ||
|
|
0d0e3a42ac | ||
|
|
74d55c065b | ||
|
|
f136da15e1 | ||
|
|
c3dde367f1 | ||
|
|
64e8d2a783 | ||
|
|
79c92c7c8a | ||
|
|
736ed38849 | ||
|
|
365791ff81 | ||
|
|
691e29ecf3 | ||
|
|
3fd02bda51 | ||
|
|
98cf2ed678 | ||
|
|
e9d32d077d | ||
|
|
2061f0b8a7 | ||
|
|
96354d6a29 | ||
|
|
d12af207d2 | ||
|
|
6eabc6cb0e | ||
|
|
2110557dab | ||
|
|
b9e84259e9 | ||
|
|
294104c3f9 | ||
|
|
38a1674abb | ||
|
|
f5c8628fdc | ||
|
|
cbc53b6b8d | ||
|
|
c54269d967 | ||
|
|
5bfd1bbc98 | ||
|
|
6984c02a27 | ||
|
|
3439c5a8e3 | ||
|
|
6806998bf9 | ||
|
|
515080ad2f | ||
|
|
3aa7b6cf66 | ||
|
|
dda4811591 | ||
|
|
82079729cc | ||
|
|
c2a8ac75e0 | ||
|
|
f178e56c68 | ||
|
|
dd793d1de5 | ||
|
|
bc34937d68 | ||
|
|
dd248f7675 | ||
|
|
d9b34baedd | ||
|
|
c18ebfdd71 | ||
|
|
67882dbb44 | ||
|
|
7b99314301 | ||
|
|
2ce5d6688b | ||
|
|
f23871e9ee | ||
|
|
e9de9dd551 | ||
|
|
ba991d5c84 | ||
|
|
1744cc99ba | ||
|
|
e72dc6cb35 | ||
|
|
c246212952 | ||
|
|
edd5fe5fa2 | ||
|
|
5d4d90536f | ||
|
|
6c916ac8a8 | ||
|
|
832ea88fcb | ||
|
|
8c00f9c15d | ||
|
|
0cbc1d2b4f | ||
|
|
ff9ddbceee | ||
|
|
9c62db07ed | ||
|
|
cf90ae0123 | ||
|
|
f5dda63eb5 | ||
|
|
7187507301 | ||
|
|
f1e72cc19a | ||
|
|
5b15bde539 | ||
|
|
bd620b01fb | ||
|
|
d9a252bc8e | ||
|
|
67005a07bc | ||
|
|
c35e4a3dd7 | ||
|
|
1f5674218f | ||
|
|
b12518d3cf | ||
|
|
6c5b7af152 | ||
|
|
8065a7e220 | ||
|
|
3f3b6b2150 | ||
|
|
a7dcc62086 | ||
|
|
ad137cd111 | ||
|
|
111af1fa2c | ||
|
|
1b2eaac316 | ||
|
|
3730a1c832 | ||
|
|
949e49a685 | ||
|
|
4a30d7e3cc | ||
|
|
e83db9e7e3 | ||
|
|
78687504f7 | ||
|
|
d571ca0108 | ||
|
|
afed90a034 | ||
|
|
3ee5c4bca5 | ||
|
|
e9c2732b97 | ||
|
|
d8714530d1 | ||
|
|
7d46c8d378 | ||
|
|
da971ec7a5 | ||
|
|
3eea74889f | ||
|
|
f758aed0e8 | ||
|
|
e5150f2c28 | ||
|
|
59a1eb59c9 | ||
|
|
6820724e51 | ||
|
|
b23ce92032 | ||
|
|
2bd231a7b7 | ||
|
|
8a173382c8 | ||
|
|
07feecde1a | ||
|
|
19091efc44 | ||
|
|
95db455e7f | ||
|
|
7879f24dcc | ||
|
|
13db4369d9 | ||
|
|
4ad7b53e59 | ||
|
|
f0cc0e68e3 | ||
|
|
db5ec52ad7 | ||
|
|
114d7270ff | ||
|
|
32c86e494a | ||
|
|
8eadcf0b90 | ||
|
|
5002175e80 | ||
|
|
daef218b55 | ||
|
|
fa9e385229 | ||
|
|
26e1188e51 | ||
|
|
a3e8a05d4c | ||
|
|
e441bad674 | ||
|
|
1b44aaf4e3 | ||
|
|
9e4e6fe207 | ||
|
|
ab66536dbf | ||
|
|
728c4c8a06 | ||
|
|
1f12122b17 | ||
|
|
890d8d960b | ||
|
|
9e74d9d003 | ||
|
|
9333fb8eb9 | ||
|
|
e2b85cf86a | ||
|
|
845a3f26f9 | ||
|
|
f07d513320 | ||
|
|
4a6769053a | ||
|
|
f31c1f90e3 | ||
|
|
3ce2c050dd | ||
|
|
1c0afa13c5 | ||
|
|
d919ecc771 | ||
|
|
e691918e3b | ||
|
|
81fbb3655f | ||
|
|
0e9164b40a | ||
|
|
1b8a0d71cf | ||
|
|
bd7efe95d0 | ||
|
|
f5bb85b435 | ||
|
|
28c145eb57 | ||
|
|
e2afb03c92 | ||
|
|
6e2527a7cb | ||
|
|
cdab68dcdb | ||
|
|
d1c3d7d139 | ||
|
|
77490c6f2f | ||
|
|
48f589e18b | ||
|
|
348616ac4b | ||
|
|
15985680e2 | ||
|
|
d74674bbd9 | ||
|
|
703475f6c2 | ||
|
|
d47af2bc02 | ||
|
|
319ad7f1d3 | ||
|
|
0f0d8bc065 | ||
|
|
55d6361b13 | ||
|
|
cd9c0d65d9 | ||
|
|
50eed24d25 | ||
|
|
e38042d4af | ||
|
|
33e3b37242 | ||
|
|
1696efe6c9 | ||
|
|
6b0511a57b | ||
|
|
a8fda4f661 | ||
|
|
30299a41fa | ||
|
|
85657b5607 | ||
|
|
0ce7b952f8 | ||
|
|
39873476f8 | ||
|
|
03dccc886e | ||
|
|
a65634d3ae | ||
|
|
80aa7e91fc | ||
|
|
bd43973522 | ||
|
|
23ec72fa03 | ||
|
|
c2637a613b | ||
|
|
88407532e7 | ||
|
|
916d219d62 | ||
|
|
ea3890a5f0 | ||
|
|
2135cacb45 | ||
|
|
7d19de2e9c | ||
|
|
94a07bbdd8 | ||
|
|
b8d4dfff9c | ||
|
|
622d45128c | ||
|
|
51602eefd3 | ||
|
|
5cc50a531f | ||
|
|
5985e3427d | ||
|
|
8b82a89997 | ||
|
|
c3c2903e72 | ||
|
|
1a8bfd92d5 | ||
|
|
847cdcca1c | ||
|
|
e3c12bf6d2 | ||
|
|
3dd6853bc8 | ||
|
|
8f89d72090 | ||
|
|
99dac099ab | ||
|
|
c4bd03c7c5 | ||
|
|
dcbf4286af | ||
|
|
00e6a2dc53 | ||
|
|
2e02311a1b | ||
|
|
89ec06c33b | ||
|
|
9fde251bf0 | ||
|
|
4c2ffb28ff | ||
|
|
246598a6b1 | ||
|
|
8bab4959be | ||
|
|
3c4cebf751 | ||
|
|
d8f31f2f8b | ||
|
|
640052b069 | ||
|
|
351d5e7b82 | ||
|
|
a008629807 | ||
|
|
76477a93b7 | ||
|
|
77c87beb06 | ||
|
|
114332b88e | ||
|
|
cb77ad836f | ||
|
|
856c990041 | ||
|
|
c5602f0baa | ||
|
|
f7f9c5f97b | ||
|
|
2c0d933594 | ||
|
|
774d1035e4 | ||
|
|
6b29d6fe70 | ||
|
|
0bfa1c4f13 | ||
|
|
c81da5f56d | ||
|
|
68bc81703e | ||
|
|
5884c2b454 | ||
|
|
45f92c00cf | ||
|
|
5467ac3196 | ||
|
|
5d7e3d0176 | ||
|
|
0373e1837e | ||
|
|
c09dade2a2 | ||
|
|
8ea5e44a43 | ||
|
|
9fb900f90c | ||
|
|
c96fc06747 | ||
|
|
b3376e5c76 | ||
|
|
e69ded7d1c | ||
|
|
767c727a81 | ||
|
|
6840a71610 | ||
|
|
7a9cb294ae | ||
|
|
ca3ea51bde | ||
|
|
dc49fb892c | ||
|
|
18a277b52d | ||
|
|
8d75fe48ca | ||
|
|
388596c914 | ||
|
|
baa15a9ec3 | ||
|
|
15063741e3 | ||
|
|
ccdc490dda | ||
|
|
a31cab7556 | ||
|
|
828da0d44e | ||
|
|
abe855d637 | ||
|
|
4efff036f0 | ||
|
|
89c920785f | ||
|
|
7b0a0dfb22 | ||
|
|
3a6ae1d33c | ||
|
|
8f1729b829 | ||
|
|
6a7c7711a2 | ||
|
|
0f83ddd4d7 | ||
|
|
065aff6c16 | ||
|
|
3d33e372a1 | ||
|
|
faf71bcd4b | ||
|
|
f270a39537 | ||
|
|
51a08e7d8f | ||
|
|
eb8fcd2666 | ||
|
|
5563a4dea8 | ||
|
|
ccd4f129e8 | ||
|
|
02cc3b51a7 | ||
|
|
d5b1eb081e | ||
|
|
f0a500545f | ||
|
|
c65146e75e | ||
|
|
41ca62cf03 | ||
|
|
974fc9b845 | ||
|
|
fee4dcc33a | ||
|
|
650a4cc55e | ||
|
|
9ca62d8668 | ||
|
|
45c35f0d58 | ||
|
|
9ba093b4f4 | ||
|
|
27208be66e | ||
|
|
87d5abef75 | ||
|
|
ec784b2526 | ||
|
|
a58f24e590 | ||
|
|
f42a006b15 | ||
|
|
3a434b07ed | ||
|
|
bd0e7802e0 | ||
|
|
06b2550cbb | ||
|
|
f775a07e30 | ||
|
|
4f0d17c05c | ||
|
|
10c38e3e46 | ||
|
|
cafb8e06c5 | ||
|
|
cbb2f59cc8 | ||
|
|
0ab278ca31 | ||
|
|
7a64d24aad | ||
|
|
dfbe60dc62 | ||
|
|
a66cf40b20 | ||
|
|
f790ad3c50 | ||
|
|
ed59a7ed23 | ||
|
|
044793d8df | ||
|
|
c2d6d2f960 | ||
|
|
8279078e21 | ||
|
|
b9c0605a8e | ||
|
|
37464a0f74 | ||
|
|
c354072828 | ||
|
|
f081c3ce4b | ||
|
|
260d119e86 | ||
|
|
a360ff80bb |
@@ -8,10 +8,6 @@ set -o pipefail
|
|||||||
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
|
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
|
||||||
mkdir -p images
|
mkdir -p images
|
||||||
cd images
|
cd images
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
|
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
|
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
|
||||||
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
|
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.892
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.892
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||||
|
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.756
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.752
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
|
||||||
|
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.756
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.752
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||||
|
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.86
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.86
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||||
|
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.624
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.624
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
|
||||||
|
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.616
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.632
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||||
|
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.792
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.824
|
||||||
|
limit: 250
|
||||||
|
num_fewshot: 5
|
||||||
3
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
3
.buildkite/lm-eval-harness/configs/models-large.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||||
|
Qwen2-57B-A14-Instruct.yaml
|
||||||
2
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
2
.buildkite/lm-eval-harness/configs/models-small.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
Meta-Llama-3-8B-Instruct.yaml
|
||||||
|
Meta-Llama-3-8B-Instruct-FP8.yaml
|
||||||
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
46
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for transformers.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||||
|
echo "This pathway is intended to be used to create baselines for "
|
||||||
|
echo "our automated nm-test-accuracy workflow"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -m - huggingface stub or local directory of the model"
|
||||||
|
echo " -b - batch size to run the evaluation at"
|
||||||
|
echo " -l - limit number of samples to run"
|
||||||
|
echo " -f - number of fewshot samples to use"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
while getopts "m:b:l:f:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
m )
|
||||||
|
MODEL="$OPTARG"
|
||||||
|
;;
|
||||||
|
b )
|
||||||
|
BATCH_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
l )
|
||||||
|
LIMIT="$OPTARG"
|
||||||
|
;;
|
||||||
|
f )
|
||||||
|
FEWSHOT="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
lm_eval --model hf \
|
||||||
|
--model_args pretrained=$MODEL,parallelize=True \
|
||||||
|
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||||
|
--batch_size $BATCH_SIZE
|
||||||
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
51
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# We can use this script to compute baseline accuracy on GSM for vllm.
|
||||||
|
# We use this for fp8, which HF does not support.
|
||||||
|
#
|
||||||
|
# Make sure you have lm-eval-harness installed:
|
||||||
|
# pip install lm-eval==0.4.2
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using huggingface transformers."
|
||||||
|
echo "This pathway is intended to be used to create baselines for "
|
||||||
|
echo "our automated nm-test-accuracy workflow"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -m - huggingface stub or local directory of the model"
|
||||||
|
echo " -b - batch size to run the evaluation at"
|
||||||
|
echo " -l - limit number of samples to run"
|
||||||
|
echo " -f - number of fewshot samples to use"
|
||||||
|
echo " -t - tensor parallel size to run at"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
while getopts "m:b:l:f:t:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
m )
|
||||||
|
MODEL="$OPTARG"
|
||||||
|
;;
|
||||||
|
b )
|
||||||
|
BATCH_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
l )
|
||||||
|
LIMIT="$OPTARG"
|
||||||
|
;;
|
||||||
|
f )
|
||||||
|
FEWSHOT="$OPTARG"
|
||||||
|
;;
|
||||||
|
t )
|
||||||
|
TP_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
lm_eval --model vllm \
|
||||||
|
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
|
||||||
|
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
||||||
|
--batch_size $BATCH_SIZE
|
||||||
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
echo``
|
||||||
|
echo "Runs lm eval harness on GSM8k using vllm and compares to "
|
||||||
|
echo "precomputed baseline (measured by HF transformers.)"
|
||||||
|
echo
|
||||||
|
echo "usage: ${0} <options>"
|
||||||
|
echo
|
||||||
|
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
|
||||||
|
echo " -t - tensor parallel size"
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
|
SUCCESS=0
|
||||||
|
|
||||||
|
while getopts "c:t:" OPT; do
|
||||||
|
case ${OPT} in
|
||||||
|
c )
|
||||||
|
CONFIG="$OPTARG"
|
||||||
|
;;
|
||||||
|
t )
|
||||||
|
TP_SIZE="$OPTARG"
|
||||||
|
;;
|
||||||
|
\? )
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# Parse list of configs.
|
||||||
|
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
|
||||||
|
|
||||||
|
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||||
|
do
|
||||||
|
LOCAL_SUCCESS=0
|
||||||
|
|
||||||
|
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
|
||||||
|
|
||||||
|
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
|
||||||
|
export LM_EVAL_TP_SIZE=$TP_SIZE
|
||||||
|
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
|
||||||
|
|
||||||
|
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||||
|
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
else
|
||||||
|
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${SUCCESS}" -eq "0" ]; then
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
54
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
54
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
"""
|
||||||
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
|
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||||
|
* export LM_EVAL_TP_SIZE=4
|
||||||
|
* pytest -s test_lm_eval_correctness.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import lm_eval
|
||||||
|
import numpy
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
RTOL = 0.02
|
||||||
|
TEST_DATA_FILE = os.environ.get(
|
||||||
|
"LM_EVAL_TEST_DATA_FILE",
|
||||||
|
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
||||||
|
|
||||||
|
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
||||||
|
|
||||||
|
|
||||||
|
def launch_lm_eval(eval_config):
|
||||||
|
model_args = f"pretrained={eval_config['model_name']}," \
|
||||||
|
f"tensor_parallel_size={TP_SIZE}"
|
||||||
|
|
||||||
|
results = lm_eval.simple_evaluate(
|
||||||
|
model="vllm",
|
||||||
|
model_args=model_args,
|
||||||
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
|
limit=eval_config["limit"],
|
||||||
|
batch_size="auto")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def test_lm_eval_correctness():
|
||||||
|
eval_config = yaml.safe_load(
|
||||||
|
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
# Launch eval requests.
|
||||||
|
results = launch_lm_eval(eval_config)
|
||||||
|
|
||||||
|
# Confirm scores match ground truth.
|
||||||
|
for task in eval_config["tasks"]:
|
||||||
|
for metric in task["metrics"]:
|
||||||
|
ground_truth = metric["value"]
|
||||||
|
measured_value = results["results"][task["name"]][metric["name"]]
|
||||||
|
print(f'{task["name"]} | {metric["name"]}: '
|
||||||
|
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||||
|
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||||
103
.buildkite/nightly-benchmarks/README.md
Normal file
103
.buildkite/nightly-benchmarks/README.md
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
# vLLM benchmark suite
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
This directory contains the performance benchmarking CI for vllm.
|
||||||
|
The goal is to help developers know the impact of their PRs on the performance of vllm.
|
||||||
|
|
||||||
|
This benchmark will be *triggered* upon:
|
||||||
|
- A PR being merged into vllm.
|
||||||
|
- Every commit for those PRs with the `perf-benchmarks` label.
|
||||||
|
|
||||||
|
**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models.
|
||||||
|
|
||||||
|
**Benchmarking Duration**: about 1hr.
|
||||||
|
|
||||||
|
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
|
||||||
|
|
||||||
|
|
||||||
|
## Configuring the workload
|
||||||
|
|
||||||
|
The benchmarking workload contains three parts:
|
||||||
|
- Latency tests in `latency-tests.json`.
|
||||||
|
- Throughput tests in `throughput-tests.json`.
|
||||||
|
- Serving tests in `serving-tests.json`.
|
||||||
|
|
||||||
|
See [descriptions.md](tests/descriptions.md) for detailed descriptions.
|
||||||
|
|
||||||
|
### Latency test
|
||||||
|
|
||||||
|
Here is an example of one test inside `latency-tests.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
In this example:
|
||||||
|
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
|
||||||
|
- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`.
|
||||||
|
|
||||||
|
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
|
||||||
|
|
||||||
|
WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file.
|
||||||
|
|
||||||
|
|
||||||
|
### Throughput test
|
||||||
|
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be passed to `benchmark_throughput.py`.
|
||||||
|
|
||||||
|
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
|
||||||
|
|
||||||
|
### Serving test
|
||||||
|
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Inside this example:
|
||||||
|
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
|
||||||
|
- The `server_parameters` attribute includes the command line arguments for the vLLM server.
|
||||||
|
- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`.
|
||||||
|
- The `qps_list` attribute controls the list of QPS values to test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`.
|
||||||
|
|
||||||
|
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still varies the output greatly.
|
||||||
|
|
||||||
|
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
|
||||||
|
|
||||||
|
## Visualizing the results
|
||||||
|
The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||||
|
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||||
|
If you do not see the table, please wait until the benchmark finishes running.
|
||||||
|
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
|
||||||
|
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
|
||||||
62
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
62
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
steps:
|
||||||
|
- label: "Wait for container to be ready"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
containers:
|
||||||
|
- image: badouralix/curl-jq
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||||
|
- wait
|
||||||
|
- label: "A100 Benchmark"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
containers:
|
||||||
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
# - label: "H100: NVIDIA SMI"
|
||||||
|
# agents:
|
||||||
|
# queue: H100
|
||||||
|
# plugins:
|
||||||
|
# - docker#v5.11.0:
|
||||||
|
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
# command:
|
||||||
|
# - bash
|
||||||
|
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
|
||||||
|
# mount-buildkite-agent: true
|
||||||
|
# propagate-environment: true
|
||||||
|
# propagate-uid-gid: false
|
||||||
|
# ipc: host
|
||||||
|
# gpus: all
|
||||||
|
# environment:
|
||||||
|
# - VLLM_USAGE_SOURCE
|
||||||
|
# - HF_TOKEN
|
||||||
|
|
||||||
27
.buildkite/nightly-benchmarks/kickoff-pipeline.sh
Executable file
27
.buildkite/nightly-benchmarks/kickoff-pipeline.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Install system packages
|
||||||
|
apt update
|
||||||
|
apt install -y curl jq
|
||||||
|
|
||||||
|
# Install minijinja for templating
|
||||||
|
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
|
||||||
|
source $HOME/.cargo/env
|
||||||
|
|
||||||
|
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
|
||||||
|
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
|
||||||
|
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
|
||||||
|
|
||||||
|
if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
|
||||||
|
echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
|
||||||
|
else
|
||||||
|
echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Upload the benchmark pipeline definition (benchmark-pipeline.yaml)
|
||||||
|
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
|
||||||
358
.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
Normal file
358
.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script should be run inside the CI process
|
||||||
|
# This script assumes that we are already inside the vllm/ directory
|
||||||
|
# Benchmarking results will be available inside vllm/benchmarks/results/
|
||||||
|
|
||||||
|
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
|
||||||
|
# and we still want to see other benchmarking results even when mixtral crashes.
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
check_gpus() {
  # Count the visible GPUs and record the GPU type.
  # Sets the globals `gpu_count` and `gpu_type`; exits with status 1 when
  # no GPU is listed by nvidia-smi.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # The unquoted command substitution joins the per-GPU names onto a single
  # line; the second whitespace-separated word is used as the GPU type.
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
check_hf_token() {
  # Validate the HF_TOKEN environment variable.
  # Exits with status 1 when it is empty or does not begin with "hf_".
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  fi

  if [[ "$HF_TOKEN" != hf_* ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  fi

  echo "HF_TOKEN is set and valid."
}
|
||||||
|
|
||||||
|
json2args() {
  # Convert a JSON object into command line flags; every '_' in a key
  # becomes '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args
  args=$(
    jq -r '
      [to_entries[] | "--\(.key | gsub("_"; "-")) \(.value | tostring)"] |
      join(" ")
    ' <<< "$json_string"
  )
  printf '%s\n' "$args"
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Poll the local completions endpoint once per second until curl succeeds.
  # Returns 0 once the server answers, or 1 if it has not answered within
  # 1200 seconds.
  if timeout 1200 bash -c '
    until curl localhost:8000/v1/completions; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Force-kill every compute process that nvidia-smi reports, then clean up.
  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
  if [ -n "$pids" ]; then
    for pid in $pids; do
      kill -9 "$pid"
      echo "Killed process with PID: $pid"
    done

    echo "All GPU processes have been killed."
  else
    echo "No GPU processes found."
  fi

  # give the killed processes time to release the GPU
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Report the memory still in use on GPU 0 so the log shows whether the
  # cleanup worked; the expected value is 0 MB.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload the benchmarking results to buildkite: annotate the build with the
  # markdown report and upload everything in $RESULTS_FOLDER as artifacts.

  # if the agent binary is not found, skip uploading the results and return 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # The redirect target is quoted so that a results path containing spaces
  # or glob characters cannot turn into an "ambiguous redirect".
  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
|
||||||
|
|
||||||
|
run_latency_tests() {
  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases
  #
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and the optional
  # TEST_SELECTOR regex. For each test case it writes
  # $RESULTS_FOLDER/<test_name>.commands and runs the benchmark.

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests.
  # NOTE: the loop body runs in a pipeline subshell, so `exit 1` below ends
  # the loop (and makes this function fail) rather than ending the script.
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-test.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
      '{
        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"

    kill_gpu_processes

  done
}
|
||||||
|
|
||||||
|
|
||||||
|
run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases
  #
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and the optional
  # TEST_SELECTOR regex. For each test case it writes
  # $RESULTS_FOLDER/<test_name>.commands and runs the benchmark.

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests.
  # NOTE: the loop body runs in a pipeline subshell, so `exit 1` below ends
  # the loop (and makes this function fail) rather than ending the script.
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-test.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"

    # record the benchmarking command and the GPU type
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"

    kill_gpu_processes

  done
}
|
||||||
|
|
||||||
|
run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  #
  # Reads the globals gpu_count, gpu_type, RESULTS_FOLDER and the optional
  # TEST_SELECTOR regex. For each test case it starts the API server in the
  # background, runs the client once per QPS value, writes one .commands file
  # per QPS value, and finally kills the GPU processes.

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests.
  # NOTE: the loop body runs in a pipeline subshell, so `exit 1` below ends
  # the loop (and makes this function fail) rather than ending the script.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name and validate its prefix
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # check if server model and client model is aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server in the background
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # jq's @sh wraps the string entry "inf" in single quotes;
      # normalize it to the bare value
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
  done
}
|
||||||
|
|
||||||
|
main() {
  # Entry point: verify the environment, install missing tools, run the three
  # benchmark suites, convert the results to markdown and upload them.
  check_gpus
  check_hf_token

  # dependencies: install wget/curl and jq only when `which` cannot find them
  if ! (which wget && which curl); then
    apt-get update && apt-get install -y wget curl
  fi
  if ! (which jq); then
    apt-get update && apt-get -y install jq
  fi

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  if ! cd benchmarks; then
    exit 1
  fi
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking: serving first, then latency, then throughput
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

  upload_to_buildkite
}

main "$@"
|
||||||
@@ -0,0 +1,192 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
results_folder = Path("results/")
|
||||||
|
|
||||||
|
# latency results and the keys that will be printed into markdown
|
||||||
|
latency_results = []
|
||||||
|
latency_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"gpu_type": "GPU",
|
||||||
|
"avg_latency": "Mean latency (ms)",
|
||||||
|
# "P10": "P10 (s)",
|
||||||
|
# "P25": "P25 (s)",
|
||||||
|
"P50": "Median latency (ms)",
|
||||||
|
# "P75": "P75 (s)",
|
||||||
|
# "P90": "P90 (s)",
|
||||||
|
"P99": "P99 latency (ms)",
|
||||||
|
}
|
||||||
|
|
||||||
|
# throughput tests and the keys that will be printed into markdown
|
||||||
|
throughput_results = []
|
||||||
|
throughput_results_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"gpu_type": "GPU",
|
||||||
|
# "num_requests": "# of req.",
|
||||||
|
# "total_num_tokens": "Total # of tokens",
|
||||||
|
# "elapsed_time": "Elapsed time (s)",
|
||||||
|
"requests_per_second": "Tput (req/s)",
|
||||||
|
# "tokens_per_second": "Tput (tok/s)",
|
||||||
|
}
|
||||||
|
|
||||||
|
# serving results and the keys that will be printed into markdown
|
||||||
|
serving_results = []
|
||||||
|
serving_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"gpu_type": "GPU",
|
||||||
|
# "completed": "# of req.",
|
||||||
|
"request_throughput": "Tput (req/s)",
|
||||||
|
# "input_throughput": "Input Tput (tok/s)",
|
||||||
|
# "output_throughput": "Output Tput (tok/s)",
|
||||||
|
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||||
|
"median_ttft_ms": "Median TTFT (ms)",
|
||||||
|
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||||
|
# "mean_tpot_ms": "Mean TPOT (ms)",
|
||||||
|
# "median_tpot_ms": "Median",
|
||||||
|
# "p99_tpot_ms": "P99",
|
||||||
|
"mean_itl_ms": "Mean ITL (ms)",
|
||||||
|
"median_itl_ms": "Median ITL (ms)",
|
||||||
|
"p99_itl_ms": "P99 ITL (ms)",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def read_markdown(file):
    """Return the text of `file` with one newline appended.

    When the file does not exist, a "<file> not found." line is returned
    instead, so the caller always receives a string.
    """
    # Open directly instead of checking os.path.exists() first: this avoids
    # the window in which the file could vanish between check and open.
    # The encoding is pinned so the result does not depend on the locale.
    try:
        with open(file, "r", encoding="utf-8") as f:
            return f.read() + "\n"
    except FileNotFoundError:
        return f"{file} not found.\n"
|
||||||
|
|
||||||
|
|
||||||
|
def results_to_json(latency, throughput, serving):
    """Serialize the three result tables into a single JSON string.

    Each argument must provide `to_dict()`; the output object has the keys
    'latency', 'throughput' and 'serving', in that order.
    """
    tables = {
        'latency': latency,
        'throughput': throughput,
        'serving': serving,
    }
    return json.dumps(
        {name: table.to_dict()
         for name, table in tables.items()})
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        # Choose the destination list from the file path. The same string is
        # used for every check, instead of mixing str(test_file) with the
        # name of a file handle that has already been closed.
        test_path = str(test_file)
        if "serving" in test_path:
            # this result is generated via `benchmark_serving.py`
            collected = serving_results
        elif "latency" in test_path:
            # this result is generated via `benchmark_latency.py`
            collected = latency_results
        elif "throughput" in test_path:
            # this result is generated via `benchmark_throughput.py`
            collected = throughput_results
        else:
            print(f"Skipping {test_file}")
            continue

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        if collected is latency_results:
            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

        # add the result to the matching list
        collected.append(raw_result)

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # keep only the mapped columns and rename them, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result: fill the placeholders of the description template
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(
            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
|
||||||
17
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
17
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/sh
# Wait until the image tagged with $BUILDKITE_COMMIT is available in the
# public ECR repository. Polls the manifest URL every 5 seconds, up to 1000
# times; exits 0 as soon as it answers HTTP 200, and 1 otherwise.
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

retries=0
while [ "$retries" -lt 1000 ]; do
  # Capture the status code first and compare it as a quoted string, so an
  # empty curl result cannot turn the test into a syntax error.
  status=$(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")
  if [ "$status" = "200" ]; then
    exit 0
  fi

  echo "Waiting for image to be available..."

  retries=$((retries + 1))
  sleep 5
done

exit 1
|
||||||
67
.buildkite/nightly-benchmarks/tests/descriptions.md
Normal file
67
.buildkite/nightly-benchmarks/tests/descriptions.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
|
||||||
|
## Latency tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's end-to-end latency under a controlled setup.
|
||||||
|
|
||||||
|
- Input length: 32 tokens.
|
||||||
|
- Output length: 128 tokens.
|
||||||
|
- Batch size: fixed (8).
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||||
|
|
||||||
|
### Latency benchmarking results
|
||||||
|
|
||||||
|
{latency_tests_markdown_table}
|
||||||
|
|
||||||
|
## Throughput tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's throughput.
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: throughput.
|
||||||
|
|
||||||
|
### Throughput benchmarking results
|
||||||
|
|
||||||
|
{throughput_tests_markdown_table}
|
||||||
|
|
||||||
|
## Serving tests
|
||||||
|
|
||||||
|
This test suite aims to test vllm's real serving metrics.
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||||
|
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||||
|
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
|
||||||
|
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||||
|
|
||||||
|
### Serving benchmarking results
|
||||||
|
|
||||||
|
{serving_tests_markdown_table}
|
||||||
|
|
||||||
|
## JSON version of the benchmarking tables
|
||||||
|
|
||||||
|
This section contains the data of the markdown tables above in JSON format.
|
||||||
|
You can load the benchmarking tables into pandas dataframes as follows:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
benchmarking_results_json = """The json string"""
|
||||||
|
benchmarking_results = json.loads(benchmarking_results_json)
|
||||||
|
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
|
||||||
|
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
|
||||||
|
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
|
||||||
|
```
|
||||||
|
|
||||||
|
The json string for all benchmarking tables:
|
||||||
|
```json
|
||||||
|
{benchmarking_results_in_json_string}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|
||||||
|
|
||||||
32
.buildkite/nightly-benchmarks/tests/latency-tests.json
Normal file
32
.buildkite/nightly-benchmarks/tests/latency-tests.json
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama70B_tp4",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num-iters-warmup": 5,
|
||||||
|
"num-iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_mixtral8x7B_tp2",
|
||||||
|
"parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num-iters-warmup": 5,
|
||||||
|
"num-iters": 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
59
.buildkite/nightly-benchmarks/tests/serving-tests.json
Normal file
59
.buildkite/nightly-benchmarks/tests/serving-tests.json
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama70B_tp4_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
35
.buildkite/nightly-benchmarks/tests/throughput-tests.json
Normal file
35
.buildkite/nightly-benchmarks/tests/throughput-tests.json
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama70B_tp4",
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_mixtral8x7B_tp2",
|
||||||
|
"parameters": {
|
||||||
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
21
.buildkite/release-pipeline.yaml
Normal file
21
.buildkite/release-pipeline.yaml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
steps:
|
||||||
|
- block: "Build wheels"
|
||||||
|
|
||||||
|
- label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue
|
||||||
|
commands:
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
|
- "mkdir artifacts"
|
||||||
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
|
||||||
|
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
matrix:
|
||||||
|
setup:
|
||||||
|
cuda_version:
|
||||||
|
- "11.8.0"
|
||||||
|
- "12.1.0"
|
||||||
|
python_version:
|
||||||
|
- "3.8"
|
||||||
|
- "3.9"
|
||||||
|
- "3.10"
|
||||||
|
- "3.11"
|
||||||
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
|
|||||||
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
|
||||||
echo "" >> benchmark_results.md
|
echo "" >> benchmark_results.md
|
||||||
echo '```' >> benchmark_results.md
|
echo '```' >> benchmark_results.md
|
||||||
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
|
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
|
||||||
echo '```' >> benchmark_results.md
|
echo '```' >> benchmark_results.md
|
||||||
|
|
||||||
# if the agent binary is not found, skip uploading the results, exit 0
|
# if the agent binary is not found, skip uploading the results, exit 0
|
||||||
if [ ! -f /workspace/buildkite-agent ]; then
|
if [ ! -f /usr/bin/buildkite-agent ]; then
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# upload the results to buildkite
|
# upload the results to buildkite
|
||||||
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
|
||||||
|
|
||||||
# exit with the exit code of the benchmarks
|
# exit with the exit code of the benchmarks
|
||||||
if [ $bench_latency_exit_code -ne 0 ]; then
|
if [ $bench_latency_exit_code -ne 0 ]; then
|
||||||
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
rm ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
/workspace/buildkite-agent artifact upload "*.json"
|
buildkite-agent artifact upload "*.json"
|
||||||
|
|||||||
@@ -4,11 +4,25 @@ set -ex
|
|||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t cpu-test -f Dockerfile.cpu .
|
docker build -t cpu-test -f Dockerfile.cpu .
|
||||||
|
docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { docker rm -f cpu-test || true; }
|
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image and launch offline inference
|
# Run the image
|
||||||
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||||
|
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
|
||||||
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
||||||
|
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
|
||||||
|
|
||||||
|
# offline inference
|
||||||
|
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
|
||||||
|
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
||||||
|
|
||||||
|
# Run basic model test
|
||||||
|
docker exec cpu-test bash -c "cd tests;
|
||||||
|
pip install pytest Pillow protobuf
|
||||||
|
cd ../
|
||||||
|
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
|
||||||
|
|||||||
14
.buildkite/run-openvino-test.sh
Executable file
14
.buildkite/run-openvino-test.sh
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t openvino-test -f Dockerfile.openvino .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
|
||||||
14
.buildkite/run-xpu-test.sh
Normal file
14
.buildkite/run-xpu-test.sh
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# This script builds the XPU docker image and runs the offline inference inside the container.
|
||||||
|
# It serves as a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t xpu-test -f Dockerfile.xpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f xpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
|
||||||
@@ -1,7 +1,10 @@
|
|||||||
# In this file, you can add more tests to run either by adding a new step or
|
# In this file, you can add more tests to run either by adding a new step or
|
||||||
# adding a new command to an existing step. See different options here for examples.
|
# adding a new command to an existing step. See different options here for examples.
|
||||||
# This script will be feed into Jinja template in `test-template.j2` to generate
|
|
||||||
# the final pipeline yaml file.
|
# This script will be fed into the Jinja template in `test-template-aws.j2` at
|
||||||
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||||
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- label: Regression Test
|
- label: Regression Test
|
||||||
@@ -24,35 +27,63 @@ steps:
|
|||||||
|
|
||||||
- label: Core Test
|
- label: Core Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
command: pytest -v -s core
|
commands:
|
||||||
|
- pytest -v -s core
|
||||||
|
- pytest -v -s distributed/test_parallel_state.py
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test
|
- label: Distributed Comm Ops Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s distributed/test_comm_ops.py
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
|
commands:
|
||||||
|
- pytest -v -s distributed/test_comm_ops.py
|
||||||
|
- pytest -v -s distributed/test_shm_broadcast.py
|
||||||
|
|
||||||
- label: Distributed Tests
|
- label: Distributed Tests (2 GPUs)
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
|
- bash ../.buildkite/download-images.sh
|
||||||
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist.py
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||||
|
|
||||||
- label: Distributed Tests (Multiple Groups)
|
- label: Distributed Tests (4 GPUs)
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
|
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
|
||||||
|
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
|
|
||||||
|
- label: Pipeline Parallelism Test
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 4
|
||||||
|
commands:
|
||||||
|
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
|
|
||||||
|
|
||||||
- label: Engine Test
|
- label: Engine Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -62,9 +93,8 @@ steps:
|
|||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s entrypoints/llm
|
||||||
- pytest -v -s entrypoints -m llm
|
- pytest -v -s entrypoints/openai
|
||||||
- pytest -v -s entrypoints -m openai
|
|
||||||
|
|
||||||
- label: Examples Test
|
- label: Examples Test
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
@@ -79,22 +109,31 @@ steps:
|
|||||||
- python3 llava_example.py
|
- python3 llava_example.py
|
||||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
|
|
||||||
|
- label: Inputs Test
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
|
commands:
|
||||||
|
- bash ../.buildkite/download-images.sh
|
||||||
|
- pytest -v -s test_inputs.py
|
||||||
|
- pytest -v -s multimodal
|
||||||
|
|
||||||
- label: Kernels Test %N
|
- label: Kernels Test %N
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
commands:
|
||||||
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
|
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: Models Test
|
- label: Models Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
commands:
|
commands:
|
||||||
- bash ../.buildkite/download-images.sh
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
- pytest -v -s models --ignore=models/test_llava.py
|
- pytest -v -s models -m \"not vlm\"
|
||||||
|
|
||||||
- label: Llava Test
|
- label: Vision Language Models Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
commands:
|
commands:
|
||||||
- bash ../.buildkite/download-images.sh
|
- bash ../.buildkite/download-images.sh
|
||||||
- pytest -v -s models/test_llava.py
|
- pytest -v -s models -m vlm
|
||||||
|
|
||||||
- label: Prefix Caching Test
|
- label: Prefix Caching Test
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -118,7 +157,10 @@ steps:
|
|||||||
|
|
||||||
- label: Speculative decoding tests
|
- label: Speculative decoding tests
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s spec_decode
|
commands:
|
||||||
|
# See https://github.com/vllm-project/vllm/issues/5152
|
||||||
|
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||||
|
- pytest -v -s spec_decode
|
||||||
|
|
||||||
- label: LoRA Test %N
|
- label: LoRA Test %N
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
@@ -130,14 +172,10 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||||
commands:
|
commands:
|
||||||
# Temporarily run this way because we cannot clean up GPU mem usage
|
# FIXIT: find out which code initializes CUDA before the test runs.
|
||||||
# for multi GPU tests.
|
# Until that is fixed, we need to use the spawn multiprocessing method to run this test.
|
||||||
# TODO(sang): Fix it.
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
|
- pytest -v -s -x lora/test_long_context.py
|
||||||
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
|
|
||||||
- pytest -v -s lora/test_long_context.py::test_self_consistency
|
|
||||||
- pytest -v -s lora/test_long_context.py::test_quality
|
|
||||||
- pytest -v -s lora/test_long_context.py::test_max_len
|
|
||||||
|
|
||||||
- label: Tensorizer Test
|
- label: Tensorizer Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
@@ -151,6 +189,15 @@ steps:
|
|||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
command: pytest -v -s quantization
|
command: pytest -v -s quantization
|
||||||
|
|
||||||
|
- label: Tracing Test
|
||||||
|
commands:
|
||||||
|
- "pip install \
|
||||||
|
opentelemetry-sdk \
|
||||||
|
opentelemetry-api \
|
||||||
|
opentelemetry-exporter-otlp \
|
||||||
|
opentelemetry-semantic-conventions-ai"
|
||||||
|
- pytest -v -s tracing
|
||||||
|
|
||||||
- label: Benchmarks
|
- label: Benchmarks
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -158,9 +205,39 @@ steps:
|
|||||||
- pip install aiohttp
|
- pip install aiohttp
|
||||||
- bash run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
|
- label: LM Eval Small Models
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
|
|
||||||
|
- label: LM Eval Large Models
|
||||||
|
gpu: a100
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
commands:
|
||||||
|
- pip install lm-eval
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||||
|
|
||||||
- label: Documentation Build
|
- label: Documentation Build
|
||||||
working_dir: "/vllm-workspace/test_docs/docs"
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r requirements-docs.txt
|
- pip install -r requirements-docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
|
|
||||||
|
- label: Distributed Tests (A100)
|
||||||
|
gpu: a100
|
||||||
|
num_gpus: 4
|
||||||
|
commands:
|
||||||
|
# NOTE: don't test the llama model here; the HF implementation seems to be buggy,
|
||||||
|
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||||
|
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|||||||
@@ -1,93 +0,0 @@
|
|||||||
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
|
|
||||||
{% set default_num_gpu = 1 %}
|
|
||||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- label: ":docker: build image"
|
|
||||||
commands:
|
|
||||||
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
|
|
||||||
- "docker push {{ docker_image }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- wait
|
|
||||||
|
|
||||||
- group: "AMD Tests"
|
|
||||||
depends_on: ~
|
|
||||||
steps:
|
|
||||||
{% for step in steps %}
|
|
||||||
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
|
|
||||||
- label: "AMD: {{ step.label }}"
|
|
||||||
agents:
|
|
||||||
queue: amd
|
|
||||||
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
- label: "Neuron Test"
|
|
||||||
depends_on: ~
|
|
||||||
agents:
|
|
||||||
queue: neuron
|
|
||||||
command: bash .buildkite/run-neuron-test.sh
|
|
||||||
soft_fail: true
|
|
||||||
|
|
||||||
- label: "Intel Test"
|
|
||||||
depends_on: ~
|
|
||||||
command: bash .buildkite/run-cpu-test.sh
|
|
||||||
|
|
||||||
{% for step in steps %}
|
|
||||||
- label: "{{ step.label }}"
|
|
||||||
agents:
|
|
||||||
queue: kubernetes
|
|
||||||
soft_fail: {{ step.soft_fail or false }}
|
|
||||||
{% if step.parallelism %}
|
|
||||||
parallelism: {{ step.parallelism }}
|
|
||||||
{% endif %}
|
|
||||||
retry:
|
|
||||||
automatic:
|
|
||||||
- exit_status: -1 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
- exit_status: -10 # Agent was lost
|
|
||||||
limit: 5
|
|
||||||
plugins:
|
|
||||||
- kubernetes:
|
|
||||||
podSpec:
|
|
||||||
{% if step.num_gpus %}
|
|
||||||
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
|
|
||||||
{% endif %}
|
|
||||||
volumes:
|
|
||||||
- name: dshm
|
|
||||||
emptyDir:
|
|
||||||
medium: Memory
|
|
||||||
containers:
|
|
||||||
- image: "{{ docker_image }}"
|
|
||||||
command: ["bash"]
|
|
||||||
args:
|
|
||||||
- '-c'
|
|
||||||
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
|
||||||
{% if not step.no_gpu %}
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
|
||||||
limits:
|
|
||||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
|
||||||
{% endif %}
|
|
||||||
env:
|
|
||||||
- name: VLLM_USAGE_SOURCE
|
|
||||||
value: ci-test
|
|
||||||
- name: HF_TOKEN
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: hf-token-secret
|
|
||||||
key: token
|
|
||||||
volumeMounts:
|
|
||||||
- mountPath: /dev/shm
|
|
||||||
name: dshm
|
|
||||||
{% endfor %}
|
|
||||||
3
.github/workflows/mypy.yaml
vendored
3
.github/workflows/mypy.yaml
vendored
@@ -37,6 +37,7 @@ jobs:
|
|||||||
mypy vllm/distributed --config-file pyproject.toml
|
mypy vllm/distributed --config-file pyproject.toml
|
||||||
mypy vllm/entrypoints --config-file pyproject.toml
|
mypy vllm/entrypoints --config-file pyproject.toml
|
||||||
mypy vllm/executor --config-file pyproject.toml
|
mypy vllm/executor --config-file pyproject.toml
|
||||||
|
mypy vllm/multimodal --config-file pyproject.toml
|
||||||
mypy vllm/usage --config-file pyproject.toml
|
mypy vllm/usage --config-file pyproject.toml
|
||||||
mypy vllm/*.py --config-file pyproject.toml
|
mypy vllm/*.py --config-file pyproject.toml
|
||||||
mypy vllm/transformers_utils --config-file pyproject.toml
|
mypy vllm/transformers_utils --config-file pyproject.toml
|
||||||
@@ -46,5 +47,5 @@ jobs:
|
|||||||
mypy vllm/model_executor --config-file pyproject.toml
|
mypy vllm/model_executor --config-file pyproject.toml
|
||||||
mypy vllm/lora --config-file pyproject.toml
|
mypy vllm/lora --config-file pyproject.toml
|
||||||
mypy vllm/logging --config-file pyproject.toml
|
mypy vllm/logging --config-file pyproject.toml
|
||||||
mypy vllm/model_executor --config-file pyproject.toml
|
mypy tests --config-file pyproject.toml
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/ruff.yml
vendored
2
.github/workflows/ruff.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
|||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
|
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
|
||||||
- name: Analysing the code with ruff
|
- name: Analysing the code with ruff
|
||||||
run: |
|
run: |
|
||||||
ruff .
|
ruff .
|
||||||
|
|||||||
@@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)
|
|||||||
|
|
||||||
project(vllm_extensions LANGUAGES CXX)
|
project(vllm_extensions LANGUAGES CXX)
|
||||||
|
|
||||||
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
|
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||||
|
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||||
|
|
||||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||||
@@ -32,8 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
|||||||
# versions are derived from Dockerfile.rocm
|
# versions are derived from Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -66,19 +66,6 @@ endif()
|
|||||||
#
|
#
|
||||||
find_package(Torch REQUIRED)
|
find_package(Torch REQUIRED)
|
||||||
|
|
||||||
#
|
|
||||||
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
|
|
||||||
# `libtorch_python.so` for linking against an extension. Torch's cmake
|
|
||||||
# configuration does not include this library (presumably since the cmake
|
|
||||||
# config is used for standalone C++ binaries that link against torch).
|
|
||||||
# The `libtorch_python.so` library defines some of the glue code between
|
|
||||||
# torch/python via pybind and is required by VLLM extensions for this
|
|
||||||
# reason. So, add it by manually with `find_library` using torch's
|
|
||||||
# installed library path.
|
|
||||||
#
|
|
||||||
find_library(torch_python_LIBRARY torch_python PATHS
|
|
||||||
"${TORCH_INSTALL_PREFIX}/lib")
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||||
#
|
#
|
||||||
@@ -111,18 +98,11 @@ elseif(HIP_FOUND)
|
|||||||
# .hip extension automatically, HIP must be enabled explicitly.
|
# .hip extension automatically, HIP must be enabled explicitly.
|
||||||
enable_language(HIP)
|
enable_language(HIP)
|
||||||
|
|
||||||
# ROCm 5.x
|
# ROCm 5.X and 6.X
|
||||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
|
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
|
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||||
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
|
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||||
endif()
|
|
||||||
|
|
||||||
# ROCm 6.x
|
|
||||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
|
|
||||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
|
|
||||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
|
|
||||||
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
|
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
||||||
@@ -171,7 +151,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/cuda_utils_kernels.cu"
|
"csrc/cuda_utils_kernels.cu"
|
||||||
"csrc/moe_align_block_size_kernels.cu"
|
"csrc/moe_align_block_size_kernels.cu"
|
||||||
"csrc/pybind.cpp")
|
"csrc/torch_bindings.cpp")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
@@ -191,10 +171,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
|
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||||
"csrc/custom_all_reduce.cu"
|
"csrc/custom_all_reduce.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
|
||||||
|
|
||||||
#
|
#
|
||||||
# The CUTLASS kernels for Hopper require sm90a to be enabled.
|
# The CUTLASS kernels for Hopper require sm90a to be enabled.
|
||||||
@@ -202,7 +183,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
|
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||||
set_source_files_properties(
|
set_source_files_properties(
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||||
PROPERTIES
|
PROPERTIES
|
||||||
COMPILE_FLAGS
|
COMPILE_FLAGS
|
||||||
"-gencode arch=compute_90a,code=sm_90a")
|
"-gencode arch=compute_90a,code=sm_90a")
|
||||||
@@ -218,6 +199,7 @@ define_gpu_extension_target(
|
|||||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||||
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -225,7 +207,7 @@ define_gpu_extension_target(
|
|||||||
#
|
#
|
||||||
|
|
||||||
set(VLLM_MOE_EXT_SRC
|
set(VLLM_MOE_EXT_SRC
|
||||||
"csrc/moe/moe_ops.cpp"
|
"csrc/moe/torch_bindings.cpp"
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
@@ -235,6 +217,7 @@ define_gpu_extension_target(
|
|||||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||||
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -249,7 +232,7 @@ set(VLLM_PUNICA_EXT_SRC
|
|||||||
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
|
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
|
||||||
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
|
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
|
||||||
"csrc/punica/punica_ops.cu"
|
"csrc/punica/punica_ops.cu"
|
||||||
"csrc/punica/punica_pybind.cpp")
|
"csrc/punica/torch_bindings.cpp")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Copy GPU compilation flags+update for punica
|
# Copy GPU compilation flags+update for punica
|
||||||
@@ -286,6 +269,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
|
|||||||
SOURCES ${VLLM_PUNICA_EXT_SRC}
|
SOURCES ${VLLM_PUNICA_EXT_SRC}
|
||||||
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
|
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
|
||||||
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
|
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
|
||||||
|
USE_SABI 3
|
||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
else()
|
else()
|
||||||
message(WARNING "Unable to create _punica_C target because none of the "
|
message(WARNING "Unable to create _punica_C target because none of the "
|
||||||
@@ -311,6 +295,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
message(STATUS "Enabling C extension.")
|
message(STATUS "Enabling C extension.")
|
||||||
add_dependencies(default _C)
|
add_dependencies(default _C)
|
||||||
|
|
||||||
|
message(STATUS "Enabling moe extension.")
|
||||||
|
add_dependencies(default _moe_C)
|
||||||
|
|
||||||
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
|
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
|
||||||
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
|
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
|
||||||
# there are supported target arches.
|
# there are supported target arches.
|
||||||
@@ -320,8 +307,3 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
add_dependencies(default _punica_C)
|
add_dependencies(default _punica_C)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|
||||||
message(STATUS "Enabling moe extension.")
|
|
||||||
add_dependencies(default _moe_C)
|
|
||||||
endif()
|
|
||||||
|
|||||||
101
Dockerfile
101
Dockerfile
@@ -5,18 +5,35 @@
|
|||||||
# docs/source/dev/dockerfile/dockerfile.rst and
|
# docs/source/dev/dockerfile/dockerfile.rst and
|
||||||
# docs/source/assets/dev/dockerfile-stages-dependency.png
|
# docs/source/assets/dev/dockerfile-stages-dependency.png
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
|
ARG PYTHON_VERSION=3
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||||
|
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y ccache software-properties-common \
|
||||||
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
|
||||||
|
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||||
|
&& python3 --version \
|
||||||
|
&& python3 -m pip --version
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y python3-pip git
|
&& apt-get install -y python3-pip git curl sudo
|
||||||
|
|
||||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
# this won't be needed for future versions of this docker image
|
# this won't be needed for future versions of this docker image
|
||||||
# or future versions of triton.
|
# or future versions of triton.
|
||||||
RUN ldconfig /usr/local/cuda-12.4/compat/
|
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
@@ -24,12 +41,11 @@ WORKDIR /workspace
|
|||||||
COPY requirements-common.txt requirements-common.txt
|
COPY requirements-common.txt requirements-common.txt
|
||||||
COPY requirements-cuda.txt requirements-cuda.txt
|
COPY requirements-cuda.txt requirements-cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-cuda.txt
|
python3 -m pip install -r requirements-cuda.txt
|
||||||
|
|
||||||
# install development dependencies
|
COPY requirements-mamba.txt requirements-mamba.txt
|
||||||
COPY requirements-dev.txt requirements-dev.txt
|
RUN python3 -m pip install packaging
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN python3 -m pip install -r requirements-mamba.txt
|
||||||
pip install -r requirements-dev.txt
|
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
# can be useful for both `dev` and `test`
|
# can be useful for both `dev` and `test`
|
||||||
@@ -39,14 +55,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
|
|||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
FROM dev AS build
|
FROM base AS build
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3
|
||||||
|
|
||||||
# install build dependencies
|
# install build dependencies
|
||||||
COPY requirements-build.txt requirements-build.txt
|
COPY requirements-build.txt requirements-build.txt
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-build.txt
|
python3 -m pip install -r requirements-build.txt
|
||||||
|
|
||||||
# install compiler cache to speed up compilation leveraging local or remote caching
|
# install compiler cache to speed up compilation leveraging local or remote caching
|
||||||
RUN apt-get update -y && apt-get install -y ccache
|
RUN apt-get update -y && apt-get install -y ccache
|
||||||
@@ -70,10 +88,28 @@ ENV NVCC_THREADS=$nvcc_threads
|
|||||||
# make sure punica kernels are built (for LoRA)
|
# make sure punica kernels are built (for LoRA)
|
||||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||||
|
|
||||||
|
ARG USE_SCCACHE
|
||||||
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||||
|
echo "Installing sccache..." \
|
||||||
|
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
|
||||||
|
&& tar -xzf sccache.tar.gz \
|
||||||
|
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||||
|
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||||
|
&& export SCCACHE_BUCKET=vllm-build-sccache \
|
||||||
|
&& export SCCACHE_REGION=us-west-2 \
|
||||||
|
&& sccache --show-stats \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
|
&& sccache --show-stats; \
|
||||||
|
fi
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 setup.py bdist_wheel --dist-dir=dist
|
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||||
|
python3 setup.py bdist_wheel --dist-dir=dist; \
|
||||||
|
fi
|
||||||
|
|
||||||
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
||||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
@@ -81,9 +117,36 @@ RUN python3 check-wheel-size.py dist
|
|||||||
|
|
||||||
#################### EXTENSION Build IMAGE ####################
|
#################### EXTENSION Build IMAGE ####################
|
||||||
|
|
||||||
|
#################### DEV IMAGE ####################
|
||||||
|
FROM base as dev
|
||||||
|
|
||||||
|
COPY requirements-lint.txt requirements-lint.txt
|
||||||
|
COPY requirements-test.txt requirements-test.txt
|
||||||
|
COPY requirements-dev.txt requirements-dev.txt
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
#################### DEV IMAGE ####################
|
||||||
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
FROM dev as mamba-builder
|
||||||
|
# max jobs used for build
|
||||||
|
ARG max_jobs=2
|
||||||
|
ENV MAX_JOBS=${max_jobs}
|
||||||
|
|
||||||
|
WORKDIR /usr/src/mamba
|
||||||
|
|
||||||
|
COPY requirements-mamba.txt requirements-mamba.txt
|
||||||
|
|
||||||
|
# Download the wheel or build it if a pre-compiled release doesn't exist
|
||||||
|
RUN pip --verbose wheel -r requirements-mamba.txt \
|
||||||
|
--no-build-isolation --no-deps --no-cache-dir
|
||||||
|
|
||||||
|
#################### MAMBA Build IMAGE ####################
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
|
||||||
|
ARG CUDA_VERSION=12.4.1
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
@@ -93,12 +156,16 @@ RUN apt-get update -y \
|
|||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
# this won't be needed for future versions of this docker image
|
# this won't be needed for future versions of this docker image
|
||||||
# or future versions of triton.
|
# or future versions of triton.
|
||||||
RUN ldconfig /usr/local/cuda-12.4/compat/
|
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||||
|
|
||||||
# install vllm wheel first, so that torch etc will be installed
|
# install vllm wheel first, so that torch etc will be installed
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install dist/*.whl --verbose
|
python3 -m pip install dist/*.whl --verbose
|
||||||
|
|
||||||
|
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
@@ -111,7 +178,7 @@ ADD . /vllm-workspace/
|
|||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -r requirements-dev.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
# doc requires source code
|
# doc requires source code
|
||||||
# we hide them inside `test_docs/` , so that this source code
|
# we hide them inside `test_docs/` , so that this source code
|
||||||
@@ -128,7 +195,7 @@ FROM vllm-base AS vllm-openai
|
|||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install accelerate hf_transfer modelscope
|
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,25 @@
|
|||||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
||||||
|
|
||||||
FROM ubuntu:22.04
|
FROM ubuntu:22.04 AS cpu-test-1
|
||||||
|
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
|
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
|
# intel-openmp provides additional performance improvement vs. openmp
|
||||||
|
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
||||||
|
RUN pip install intel-openmp
|
||||||
|
|
||||||
|
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
|
||||||
|
|
||||||
|
|
||||||
|
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
|
||||||
|
|
||||||
RUN pip install --upgrade pip \
|
RUN pip install --upgrade pip \
|
||||||
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
|
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
|
||||||
|
|
||||||
|
FROM cpu-test-1 AS build
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
@@ -15,8 +27,14 @@ WORKDIR /workspace/vllm
|
|||||||
|
|
||||||
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
|
|
||||||
|
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||||
|
ARG VLLM_CPU_DISABLE_AVX512
|
||||||
|
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||||
|
|
||||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
WORKDIR /workspace/
|
WORKDIR /workspace/
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
|
|||||||
RUN cd /app/vllm \
|
RUN cd /app/vllm \
|
||||||
&& python3 -m pip install -U -r requirements-neuron.txt
|
&& python3 -m pip install -U -r requirements-neuron.txt
|
||||||
|
|
||||||
ENV VLLM_BUILD_WITH_NEURON 1
|
ENV VLLM_TARGET_DEVICE neuron
|
||||||
RUN cd /app/vllm \
|
RUN cd /app/vllm \
|
||||||
&& pip install -e . \
|
&& pip install -e . \
|
||||||
&& cd ..
|
&& cd ..
|
||||||
|
|||||||
26
Dockerfile.openvino
Normal file
26
Dockerfile.openvino
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||||
|
# to run the OpenAI compatible server.
|
||||||
|
|
||||||
|
FROM ubuntu:22.04 AS dev
|
||||||
|
|
||||||
|
RUN apt-get update -y && \
|
||||||
|
apt-get install -y python3-pip git
|
||||||
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
# copy requirements
|
||||||
|
COPY requirements-build.txt /workspace/vllm/
|
||||||
|
COPY requirements-common.txt /workspace/vllm/
|
||||||
|
COPY requirements-openvino.txt /workspace/vllm/
|
||||||
|
|
||||||
|
COPY vllm/ /workspace/vllm/vllm
|
||||||
|
COPY setup.py /workspace/vllm/
|
||||||
|
|
||||||
|
# install build requirements
|
||||||
|
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
|
||||||
|
# build vLLM with OpenVINO backend
|
||||||
|
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
|
||||||
|
|
||||||
|
COPY examples/ /workspace/vllm/examples
|
||||||
|
COPY benchmarks/ /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
22
Dockerfile.ppc64le
Normal file
22
Dockerfile.ppc64le
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
FROM mambaorg/micromamba
|
||||||
|
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||||
|
USER root
|
||||||
|
|
||||||
|
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||||
|
|
||||||
|
# Some packages in requirements-cpu are installed here
|
||||||
|
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||||
|
# Currently these may not be available for venv or pip directly
|
||||||
|
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
|
||||||
|
|
||||||
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
# These packages will be in rocketce eventually
|
||||||
|
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||||
|
|
||||||
|
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||||
|
|
||||||
|
WORKDIR /vllm-workspace
|
||||||
|
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
212
Dockerfile.rocm
212
Dockerfile.rocm
@@ -1,35 +1,35 @@
|
|||||||
# default base image
|
# Default ROCm 6.1 base image
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
# Tested and supported base rocm/pytorch images
|
||||||
|
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
|
||||||
|
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
|
||||||
|
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||||
|
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
# Default ROCm ARCHes to build vLLM for.
|
||||||
|
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
||||||
|
|
||||||
RUN echo "Base image is $BASE_IMAGE"
|
# Whether to build CK-based flash-attention
|
||||||
|
# If 0, will not build flash attention
|
||||||
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
|
# This is useful for gfx target where flash-attention is not supported
|
||||||
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
# (i.e. those that do not appear in `FA_GFX_ARCHS`)
|
||||||
|
# Triton FA is used by default on ROCm now so this is unnecessary.
|
||||||
|
|
||||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
|
||||||
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
|
|
||||||
|
|
||||||
ARG FA_BRANCH="ae7928c"
|
|
||||||
RUN echo "FA_BRANCH is $FA_BRANCH"
|
|
||||||
|
|
||||||
# whether to build flash-attention
|
|
||||||
# if 0, will not build flash attention
|
|
||||||
# this is useful for gfx target where flash-attention is not supported
|
|
||||||
# In that case, we need to use the python reference attention implementation in vllm
|
|
||||||
ARG BUILD_FA="1"
|
ARG BUILD_FA="1"
|
||||||
|
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
||||||
|
ARG FA_BRANCH="ae7928c"
|
||||||
|
|
||||||
# whether to build triton on rocm
|
# Whether to build triton on rocm
|
||||||
ARG BUILD_TRITON="1"
|
ARG BUILD_TRITON="1"
|
||||||
|
ARG TRITON_BRANCH="0ef1848"
|
||||||
|
|
||||||
|
### Base image build stage
|
||||||
|
FROM $BASE_IMAGE AS base
|
||||||
|
|
||||||
|
# Import arg(s) defined before this build stage
|
||||||
|
ARG PYTORCH_ROCM_ARCH
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update && apt-get install python3 python3-pip -y
|
||||||
|
|
||||||
# Install some basic utilities
|
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
curl \
|
curl \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
@@ -40,75 +40,165 @@ RUN apt-get update && apt-get install -y \
|
|||||||
build-essential \
|
build-essential \
|
||||||
wget \
|
wget \
|
||||||
unzip \
|
unzip \
|
||||||
nvidia-cuda-toolkit \
|
|
||||||
tmux \
|
tmux \
|
||||||
|
ccache \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
### Mount Point ###
|
# When launching the container, mount the code directory to /vllm-workspace
|
||||||
# When launching the container, mount the code directory to /app
|
|
||||||
ARG APP_MOUNT=/vllm-workspace
|
ARG APP_MOUNT=/vllm-workspace
|
||||||
VOLUME [ ${APP_MOUNT} ]
|
|
||||||
WORKDIR ${APP_MOUNT}
|
WORKDIR ${APP_MOUNT}
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN pip install --upgrade pip
|
||||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
# Remove sccache so it doesn't interfere with ccache
|
||||||
|
# TODO: implement sccache support across components
|
||||||
|
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||||
|
# Install torch == 2.4.0 on ROCm
|
||||||
|
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
|
*"rocm-5.7"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||||
|
torchvision==0.19.0.dev20240612 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
|
||||||
|
*"rocm-6.0"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||||
|
torchvision==0.19.0.dev20240612 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
|
||||||
|
*"rocm-6.1"*) \
|
||||||
|
pip uninstall -y torch torchaudio torchvision \
|
||||||
|
&& pip install --no-cache-dir --pre \
|
||||||
|
torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
|
||||||
|
torchvision==0.19.0.dev20240612 \
|
||||||
|
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
||||||
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
||||||
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
||||||
|
|
||||||
# Install ROCm flash-attention
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
RUN if [ "$BUILD_FA" = "1" ]; then \
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
mkdir libs \
|
|
||||||
|
|
||||||
|
### AMD-SMI build stage
|
||||||
|
FROM base AS build_amdsmi
|
||||||
|
# Build amdsmi wheel always
|
||||||
|
RUN cd /opt/rocm/share/amd_smi \
|
||||||
|
&& pip wheel . --wheel-dir=/install
|
||||||
|
|
||||||
|
|
||||||
|
### Flash-Attention wheel build stage
|
||||||
|
FROM base AS build_fa
|
||||||
|
ARG BUILD_FA
|
||||||
|
ARG FA_GFX_ARCHS
|
||||||
|
ARG FA_BRANCH
|
||||||
|
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
||||||
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
if [ "$BUILD_FA" = "1" ]; then \
|
||||||
|
mkdir -p libs \
|
||||||
&& cd libs \
|
&& cd libs \
|
||||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
&& git clone https://github.com/ROCm/flash-attention.git \
|
||||||
&& cd flash-attention \
|
&& cd flash-attention \
|
||||||
&& git checkout ${FA_BRANCH} \
|
&& git checkout "${FA_BRANCH}" \
|
||||||
&& git submodule update --init \
|
&& git submodule update --init \
|
||||||
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
|
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
|
*"rocm-5.7"*) \
|
||||||
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
|
export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
|
||||||
&& python3 setup.py install \
|
&& patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
|
||||||
&& cd ..; \
|
*) ;; esac \
|
||||||
|
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||||
|
# Create an empty directory otherwise as later build stages expect one
|
||||||
|
else mkdir -p /install; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
|
||||||
# Manually removed it so that later steps of numpy upgrade can continue
|
|
||||||
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
|
|
||||||
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
|
|
||||||
|
|
||||||
# build triton
|
### Triton wheel build stage
|
||||||
RUN if [ "$BUILD_TRITON" = "1" ]; then \
|
FROM base AS build_triton
|
||||||
|
ARG BUILD_TRITON
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
# Build triton wheel if `BUILD_TRITON = 1`
|
||||||
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
if [ "$BUILD_TRITON" = "1" ]; then \
|
||||||
mkdir -p libs \
|
mkdir -p libs \
|
||||||
&& cd libs \
|
&& cd libs \
|
||||||
&& pip uninstall -y triton \
|
&& git clone https://github.com/OpenAI/triton.git \
|
||||||
&& git clone https://github.com/ROCm/triton.git \
|
&& cd triton \
|
||||||
&& cd triton/python \
|
&& git checkout "${TRITON_BRANCH}" \
|
||||||
&& pip3 install . \
|
&& cd python \
|
||||||
&& cd ../..; \
|
&& python3 setup.py bdist_wheel --dist-dir=/install; \
|
||||||
|
# Create an empty directory otherwise as later build stages expect one
|
||||||
|
else mkdir -p /install; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
WORKDIR /vllm-workspace
|
|
||||||
|
### Final vLLM build stage
|
||||||
|
FROM base AS final
|
||||||
|
# Import the vLLM development directory from the build context
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
#RUN python3 -m pip install pynvml # to be removed eventually
|
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
||||||
RUN python3 -m pip install --upgrade pip numba
|
# Manually remove it so that later steps of numpy upgrade can continue
|
||||||
|
RUN case "$(which python3)" in \
|
||||||
|
*"/opt/conda/envs/py_3.9"*) \
|
||||||
|
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
# make sure punica kernels are built (for LoRA)
|
# Package upgrades for useful functionality or to avoid dependency issues
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
pip install --upgrade numba scipy huggingface-hub[cli]
|
||||||
|
|
||||||
|
# Make sure punica kernels are built (for LoRA)
|
||||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||||
# Workaround for ray >= 2.10.0
|
# Workaround for ray >= 2.10.0
|
||||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
# Silences the HF Tokenizers warning
|
||||||
|
ENV TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
|
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
||||||
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -U -r requirements-rocm.txt \
|
pip install -U -r requirements-rocm.txt \
|
||||||
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
|
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
||||||
&& python3 setup.py install \
|
*"rocm-6.0"*) \
|
||||||
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
|
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
|
||||||
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
|
*"rocm-6.1"*) \
|
||||||
&& cd ..
|
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
|
||||||
|
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
|
||||||
|
&& cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
|
||||||
|
# Prevent interference if torch bundles its own HIP runtime
|
||||||
|
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
|
||||||
|
*) ;; esac \
|
||||||
|
&& python3 setup.py clean --all \
|
||||||
|
&& python3 setup.py develop
|
||||||
|
|
||||||
|
# Copy amdsmi wheel into final image
|
||||||
|
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y amdsmi;
|
||||||
|
|
||||||
|
# Copy triton wheel(s) into final image if they were built
|
||||||
|
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& if ls /install/*.whl; then \
|
||||||
|
cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y triton; fi
|
||||||
|
|
||||||
|
# Copy flash-attn wheel(s) into final image if they were built
|
||||||
|
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
||||||
|
mkdir -p libs \
|
||||||
|
&& if ls /install/*.whl; then \
|
||||||
|
cp /install/*.whl libs \
|
||||||
|
# Preemptively uninstall to avoid same-version no-installs
|
||||||
|
&& pip uninstall -y flash-attn; fi
|
||||||
|
|
||||||
|
# Install wheels that were built to the final image
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
if ls libs/*.whl; then \
|
||||||
|
pip install libs/*.whl; fi
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
19
Dockerfile.tpu
Normal file
19
Dockerfile.tpu
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
ARG NIGHTLY_DATE="20240601"
|
||||||
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
|
FROM $BASE_IMAGE
|
||||||
|
|
||||||
|
WORKDIR /workspace
|
||||||
|
COPY . /workspace/vllm
|
||||||
|
|
||||||
|
ENV VLLM_TARGET_DEVICE="tpu"
|
||||||
|
# Install aiohttp separately to avoid build errors.
|
||||||
|
RUN pip install aiohttp
|
||||||
|
# Install the TPU and Pallas dependencies.
|
||||||
|
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
||||||
|
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||||
|
|
||||||
|
# Build vLLM.
|
||||||
|
RUN cd /workspace/vllm && python setup.py develop
|
||||||
|
|
||||||
|
CMD ["/bin/bash"]
|
||||||
22
Dockerfile.xpu
Normal file
22
Dockerfile.xpu
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Image for building vLLM for Intel GPUs (XPU) on top of the oneAPI base kit.
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04

# Register the Intel oneAPI and Intel graphics APT repositories together with
# their signing keys. The graphics list that may ship with the base image is
# removed first; `rm -f` keeps the build working if that file is absent.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm -f /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

# Install system packages; the downloaded package lists are deleted in the
# same layer so they do not end up in the image.
RUN apt-get update -y \
    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-xpu.txt

RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]
|
||||||
15
README.md
15
README.md
@@ -16,16 +16,17 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
|
**Ray Summit CFP is Open (June 4th to June 20th)!**
|
||||||
|
|
||||||
We are thrilled to announce our fourth vLLM Meetup!
|
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
|
||||||
The vLLM team will share recent updates and roadmap.
|
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
|
||||||
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
|
This will be a great chance for everyone in the community to get together and learn.
|
||||||
Please register [here](https://lu.ma/agivllm) and join us!
|
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||||
- [2024/01] Added ROCm 6.0 support to vLLM.
|
- [2024/01] Added ROCm 6.0 support to vLLM.
|
||||||
@@ -58,7 +59,7 @@ vLLM is flexible and easy to use with:
|
|||||||
- Tensor parallelism support for distributed inference
|
- Tensor parallelism support for distributed inference
|
||||||
- Streaming outputs
|
- Streaming outputs
|
||||||
- OpenAI-compatible API server
|
- OpenAI-compatible API server
|
||||||
- Support NVIDIA GPUs and AMD GPUs
|
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
|
||||||
- (Experimental) Prefix caching support
|
- (Experimental) Prefix caching support
|
||||||
- (Experimental) Multi-lora support
|
- (Experimental) Multi-lora support
|
||||||
|
|
||||||
@@ -107,9 +108,11 @@ vLLM is a community project. Our compute resources for development and testing a
|
|||||||
- Replicate
|
- Replicate
|
||||||
- Roblox
|
- Roblox
|
||||||
- RunPod
|
- RunPod
|
||||||
|
- Sequoia Capital
|
||||||
- Trainy
|
- Trainy
|
||||||
- UC Berkeley
|
- UC Berkeley
|
||||||
- UC San Diego
|
- UC San Diego
|
||||||
|
- ZhenFund
|
||||||
|
|
||||||
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||||
|
|
||||||
|
|||||||
@@ -4,10 +4,13 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
import huggingface_hub.constants
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
|
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||||
|
PreTrainedTokenizerFast)
|
||||||
|
|
||||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||||
|
|
||||||
@@ -68,9 +71,13 @@ async def async_request_tgi(
|
|||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||||
|
|
||||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
#NOTE: Sometimes TGI returns a ping response without
|
||||||
"data:")
|
# any data, we should skip it.
|
||||||
|
if chunk_bytes.startswith(":"):
|
||||||
|
continue
|
||||||
|
chunk = remove_prefix(chunk_bytes, "data:")
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
@@ -218,8 +225,8 @@ async def async_request_openai_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"v1/completions"
|
"completions"
|
||||||
), "OpenAI Completions API URL must end with 'v1/completions'."
|
), "OpenAI Completions API URL must end with 'completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert not request_func_input.use_beam_search
|
assert not request_func_input.use_beam_search
|
||||||
@@ -258,6 +265,9 @@ async def async_request_openai_completions(
|
|||||||
else:
|
else:
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
if data["choices"][0]["text"]:
|
if data["choices"][0]["text"]:
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
@@ -266,12 +276,8 @@ async def async_request_openai_completions(
|
|||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
# NOTE: Some completion API might have a last
|
output.itl.append(timestamp -
|
||||||
# usage summary response without a token so we
|
most_recent_timestamp)
|
||||||
# do not want to include as inter-token-latency
|
|
||||||
elif data.get("usage", None) is None:
|
|
||||||
output.itl.append(timestamp -
|
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += data["choices"][0]["text"]
|
generated_text += data["choices"][0]["text"]
|
||||||
@@ -298,8 +304,8 @@ async def async_request_openai_chat_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"v1/chat/completions"
|
"chat/completions"
|
||||||
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
assert not request_func_input.use_beam_search
|
assert not request_func_input.use_beam_search
|
||||||
@@ -384,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_model(pretrained_model_name_or_path: str) -> str:
    """Resolve a model identifier to something a tokenizer loader can open.

    Args:
        pretrained_model_name_or_path: Model name/repo id to resolve.

    Returns:
        When the ``VLLM_USE_MODELSCOPE`` environment variable equals "true"
        (case-insensitive), whatever ModelScope's ``snapshot_download``
        returns for the model (presumably a local directory -- confirm
        against the ModelScope docs). Otherwise the identifier is returned
        unchanged.
    """
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

        # Weight files are excluded from the download via the ignore list.
        return snapshot_download(
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

    # The `model_id` / `ignore_file_pattern` keywords above are ModelScope's;
    # huggingface_hub.snapshot_download() does not accept them, so calling it
    # the same way raises TypeError. Hand the identifier back as-is and let
    # the caller's loader resolve it.
    return pretrained_model_name_or_path
|
||||||
|
|
||||||
|
|
||||||
|
def get_tokenizer(
    pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a tokenizer for the given model name or local path.

    If the argument is not None and does not exist on the local filesystem,
    it is first passed through ``get_model`` and the result is used instead.
    The (possibly replaced) value is then given to
    ``AutoTokenizer.from_pretrained`` together with ``trust_remote_code``.
    """
    is_given = pretrained_model_name_or_path is not None
    if is_given and not os.path.exists(pretrained_model_name_or_path):
        # Not a local path: let get_model() resolve it.
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)

    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=trust_remote_code,
    )
    return tokenizer
|
||||||
|
|
||||||
|
|
||||||
ASYNC_REQUEST_FUNCS = {
|
ASYNC_REQUEST_FUNCS = {
|
||||||
"tgi": async_request_tgi,
|
"tgi": async_request_tgi,
|
||||||
"vllm": async_request_openai_completions,
|
"vllm": async_request_openai_completions,
|
||||||
@@ -392,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
|
|||||||
"openai": async_request_openai_completions,
|
"openai": async_request_openai_completions,
|
||||||
"openai-chat": async_request_openai_chat_completions,
|
"openai-chat": async_request_openai_chat_completions,
|
||||||
"tensorrt-llm": async_request_trt_llm,
|
"tensorrt-llm": async_request_trt_llm,
|
||||||
|
"scalellm": async_request_openai_completions,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,8 +10,10 @@ import torch
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptStrictInputs
|
from vllm.inputs import PromptStrictInputs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
@@ -19,24 +21,33 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||||
# the engine will automatically process the request in multiple batches.
|
# the engine will automatically process the request in multiple batches.
|
||||||
llm = LLM(model=args.model,
|
llm = LLM(
|
||||||
speculative_model=args.speculative_model,
|
model=args.model,
|
||||||
num_speculative_tokens=args.num_speculative_tokens,
|
speculative_model=args.speculative_model,
|
||||||
tokenizer=args.tokenizer,
|
num_speculative_tokens=args.num_speculative_tokens,
|
||||||
quantization=args.quantization,
|
speculative_draft_tensor_parallel_size=\
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
args.speculative_draft_tensor_parallel_size,
|
||||||
trust_remote_code=args.trust_remote_code,
|
tokenizer=args.tokenizer,
|
||||||
dtype=args.dtype,
|
quantization=args.quantization,
|
||||||
enforce_eager=args.enforce_eager,
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
kv_cache_dtype=args.kv_cache_dtype,
|
trust_remote_code=args.trust_remote_code,
|
||||||
quantization_param_path=args.quantization_param_path,
|
dtype=args.dtype,
|
||||||
device=args.device,
|
max_model_len=args.max_model_len,
|
||||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
enforce_eager=args.enforce_eager,
|
||||||
use_v2_block_manager=args.use_v2_block_manager,
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
quantization_param_path=args.quantization_param_path,
|
||||||
download_dir=args.download_dir,
|
device=args.device,
|
||||||
block_size=args.block_size,
|
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||||
gpu_memory_utilization=args.gpu_memory_utilization)
|
use_v2_block_manager=args.use_v2_block_manager,
|
||||||
|
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||||
|
download_dir=args.download_dir,
|
||||||
|
block_size=args.block_size,
|
||||||
|
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||||
|
load_format=args.load_format,
|
||||||
|
distributed_executor_backend=args.distributed_executor_backend,
|
||||||
|
otlp_traces_endpoint=args.otlp_traces_endpoint,
|
||||||
|
enable_prefix_caching=args.enable_prefix_caching,
|
||||||
|
)
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
n=args.n,
|
n=args.n,
|
||||||
@@ -95,7 +106,7 @@ def main(args: argparse.Namespace):
|
|||||||
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
||||||
latencies.append(run_to_completion(profile_dir=None))
|
latencies.append(run_to_completion(profile_dir=None))
|
||||||
latencies = np.array(latencies)
|
latencies = np.array(latencies)
|
||||||
percentages = [10, 25, 50, 75, 90]
|
percentages = [10, 25, 50, 75, 90, 99]
|
||||||
percentiles = np.percentile(latencies, percentages)
|
percentiles = np.percentile(latencies, percentages)
|
||||||
print(f'Avg latency: {np.mean(latencies)} seconds')
|
print(f'Avg latency: {np.mean(latencies)} seconds')
|
||||||
for percentage, percentile in zip(percentages, percentiles):
|
for percentage, percentile in zip(percentages, percentiles):
|
||||||
@@ -113,12 +124,16 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the latency of processing a single batch of '
|
description='Benchmark the latency of processing a single batch of '
|
||||||
'requests till completion.')
|
'requests till completion.')
|
||||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
||||||
parser.add_argument('--speculative-model', type=str, default=None)
|
parser.add_argument('--speculative-model', type=str, default=None)
|
||||||
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
||||||
|
parser.add_argument('--speculative-draft-tensor-parallel-size',
|
||||||
|
'-spec-draft-tp',
|
||||||
|
type=int,
|
||||||
|
default=None)
|
||||||
parser.add_argument('--tokenizer', type=str, default=None)
|
parser.add_argument('--tokenizer', type=str, default=None)
|
||||||
parser.add_argument('--quantization',
|
parser.add_argument('--quantization',
|
||||||
'-q',
|
'-q',
|
||||||
@@ -144,6 +159,12 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('--trust-remote-code',
|
parser.add_argument('--trust-remote-code',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help='trust remote code from huggingface')
|
help='trust remote code from huggingface')
|
||||||
|
parser.add_argument(
|
||||||
|
'--max-model-len',
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help='Maximum length of a sequence (including prompt and output). '
|
||||||
|
'If None, will be derived from the model.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--dtype',
|
'--dtype',
|
||||||
type=str,
|
type=str,
|
||||||
@@ -187,9 +208,10 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--device",
|
"--device",
|
||||||
type=str,
|
type=str,
|
||||||
default="cuda",
|
default="auto",
|
||||||
choices=["cuda", "cpu"],
|
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
||||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
||||||
|
'CPU.')
|
||||||
parser.add_argument('--block-size',
|
parser.add_argument('--block-size',
|
||||||
type=int,
|
type=int,
|
||||||
default=16,
|
default=16,
|
||||||
@@ -199,6 +221,9 @@ if __name__ == '__main__':
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help='If True, the prefill requests can be chunked based on the '
|
help='If True, the prefill requests can be chunked based on the '
|
||||||
'max_num_batched_tokens')
|
'max_num_batched_tokens')
|
||||||
|
parser.add_argument("--enable-prefix-caching",
|
||||||
|
action='store_true',
|
||||||
|
help="Enable automatic prefix caching")
|
||||||
parser.add_argument('--use-v2-block-manager', action='store_true')
|
parser.add_argument('--use-v2-block-manager', action='store_true')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--ray-workers-use-nsight",
|
"--ray-workers-use-nsight",
|
||||||
@@ -221,5 +246,40 @@ if __name__ == '__main__':
|
|||||||
help='the fraction of GPU memory to be used for '
|
help='the fraction of GPU memory to be used for '
|
||||||
'the model executor, which can range from 0 to 1.'
|
'the model executor, which can range from 0 to 1.'
|
||||||
'If unspecified, will use the default value of 0.9.')
|
'If unspecified, will use the default value of 0.9.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--load-format',
|
||||||
|
type=str,
|
||||||
|
default=EngineArgs.load_format,
|
||||||
|
choices=[
|
||||||
|
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||||
|
'bitsandbytes'
|
||||||
|
],
|
||||||
|
help='The format of the model weights to load.\n\n'
|
||||||
|
'* "auto" will try to load the weights in the safetensors format '
|
||||||
|
'and fall back to the pytorch bin format if safetensors format '
|
||||||
|
'is not available.\n'
|
||||||
|
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||||
|
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||||
|
'* "npcache" will load the weights in pytorch format and store '
|
||||||
|
'a numpy cache to speed up the loading.\n'
|
||||||
|
'* "dummy" will initialize the weights with random values, '
|
||||||
|
'which is mainly for profiling.\n'
|
||||||
|
'* "tensorizer" will load the weights using tensorizer from '
|
||||||
|
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||||
|
'section for more information.\n'
|
||||||
|
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||||
|
'quantization.\n')
|
||||||
|
parser.add_argument(
|
||||||
|
'--distributed-executor-backend',
|
||||||
|
choices=['ray', 'mp'],
|
||||||
|
default=None,
|
||||||
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
|
'is used, will be automatically set to "ray" if installed '
|
||||||
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--otlp-traces-endpoint',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help='Target URL to which OpenTelemetry traces will be sent.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import argparse
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
|
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
|
||||||
|
|
||||||
@@ -44,7 +44,7 @@ def main(args):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the performance with or without automatic '
|
description='Benchmark the performance with or without automatic '
|
||||||
'prefix caching.')
|
'prefix caching.')
|
||||||
parser.add_argument('--model',
|
parser.add_argument('--model',
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import AsyncGenerator, List, Optional, Tuple
|
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||||
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
|||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
try:
|
||||||
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
except ImportError:
|
||||||
|
from backend_request_func import get_tokenizer
|
||||||
|
|
||||||
|
try:
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
except ImportError:
|
||||||
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -56,6 +64,9 @@ class BenchmarkMetrics:
|
|||||||
mean_tpot_ms: float
|
mean_tpot_ms: float
|
||||||
median_tpot_ms: float
|
median_tpot_ms: float
|
||||||
p99_tpot_ms: float
|
p99_tpot_ms: float
|
||||||
|
mean_itl_ms: float
|
||||||
|
median_itl_ms: float
|
||||||
|
p99_itl_ms: float
|
||||||
|
|
||||||
|
|
||||||
def sample_sharegpt_requests(
|
def sample_sharegpt_requests(
|
||||||
@@ -197,19 +208,27 @@ def calculate_metrics(
|
|||||||
dur_s: float,
|
dur_s: float,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||||
actual_output_lens = []
|
actual_output_lens: List[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
completed = 0
|
completed = 0
|
||||||
tpots = []
|
itls: List[float] = []
|
||||||
ttfts = []
|
tpots: List[float] = []
|
||||||
|
ttfts: List[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
if outputs[i].success:
|
if outputs[i].success:
|
||||||
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
|
# We use the tokenizer to count the number of output tokens for all
|
||||||
|
# serving backends instead of looking at len(outputs[i].itl) since
|
||||||
|
# multiple output tokens may be bundled together
|
||||||
|
# Note: this may inflate the output token count slightly
|
||||||
|
output_len = len(
|
||||||
|
tokenizer(outputs[i].generated_text,
|
||||||
|
add_special_tokens=False).input_ids)
|
||||||
actual_output_lens.append(output_len)
|
actual_output_lens.append(output_len)
|
||||||
total_input += input_requests[i][1]
|
total_input += input_requests[i][1]
|
||||||
if output_len > 1:
|
if output_len > 1:
|
||||||
tpots.append(
|
tpots.append(
|
||||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||||
|
itls += outputs[i].itl
|
||||||
ttfts.append(outputs[i].ttft)
|
ttfts.append(outputs[i].ttft)
|
||||||
completed += 1
|
completed += 1
|
||||||
else:
|
else:
|
||||||
@@ -234,6 +253,9 @@ def calculate_metrics(
|
|||||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||||
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
|
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
|
||||||
|
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||||
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
|
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
return metrics, actual_output_lens
|
return metrics, actual_output_lens
|
||||||
@@ -251,7 +273,7 @@ async def benchmark(
|
|||||||
disable_tqdm: bool,
|
disable_tqdm: bool,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS.get(backend)
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {backend}")
|
raise ValueError(f"Unknown backend: {backend}")
|
||||||
|
|
||||||
@@ -278,7 +300,7 @@ async def benchmark(
|
|||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_start_time = time.perf_counter()
|
||||||
tasks = []
|
tasks: List[asyncio.Task] = []
|
||||||
async for request in get_request(input_requests, request_rate):
|
async for request in get_request(input_requests, request_rate):
|
||||||
prompt, prompt_len, output_len = request
|
prompt, prompt_len, output_len = request
|
||||||
request_func_input = RequestFuncInput(
|
request_func_input = RequestFuncInput(
|
||||||
@@ -296,7 +318,7 @@ async def benchmark(
|
|||||||
pbar=pbar)))
|
pbar=pbar)))
|
||||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if not disable_tqdm:
|
if pbar is not None:
|
||||||
pbar.close()
|
pbar.close()
|
||||||
|
|
||||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||||
@@ -333,6 +355,10 @@ async def benchmark(
|
|||||||
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
|
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
|
||||||
metrics.median_tpot_ms))
|
metrics.median_tpot_ms))
|
||||||
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
|
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
|
||||||
|
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
|
||||||
|
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
|
||||||
|
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
|
||||||
|
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -349,6 +375,9 @@ async def benchmark(
|
|||||||
"mean_tpot_ms": metrics.mean_tpot_ms,
|
"mean_tpot_ms": metrics.mean_tpot_ms,
|
||||||
"median_tpot_ms": metrics.median_tpot_ms,
|
"median_tpot_ms": metrics.median_tpot_ms,
|
||||||
"p99_tpot_ms": metrics.p99_tpot_ms,
|
"p99_tpot_ms": metrics.p99_tpot_ms,
|
||||||
|
"mean_itl_ms": metrics.mean_itl_ms,
|
||||||
|
"median_itl_ms": metrics.median_itl_ms,
|
||||||
|
"p99_itl_ms": metrics.p99_itl_ms,
|
||||||
"input_lens": [output.prompt_len for output in outputs],
|
"input_lens": [output.prompt_len for output in outputs],
|
||||||
"output_lens": actual_output_lens,
|
"output_lens": actual_output_lens,
|
||||||
"ttfts": [output.ttft for output in outputs],
|
"ttfts": [output.ttft for output in outputs],
|
||||||
@@ -445,7 +474,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
if args.save_result:
|
if args.save_result:
|
||||||
result_json = {}
|
result_json: Dict[str, Any] = {}
|
||||||
|
|
||||||
# Setup
|
# Setup
|
||||||
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
@@ -478,6 +507,8 @@ def main(args: argparse.Namespace):
|
|||||||
# Save to file
|
# Save to file
|
||||||
base_model_id = model_id.split("/")[-1]
|
base_model_id = model_id.split("/")[-1]
|
||||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||||
|
if args.result_filename:
|
||||||
|
file_name = args.result_filename
|
||||||
if args.result_dir:
|
if args.result_dir:
|
||||||
file_name = os.path.join(args.result_dir, file_name)
|
file_name = os.path.join(args.result_dir, file_name)
|
||||||
with open(file_name, "w") as outfile:
|
with open(file_name, "w") as outfile:
|
||||||
@@ -485,7 +516,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the online serving throughput.")
|
description="Benchmark the online serving throughput.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--backend",
|
"--backend",
|
||||||
@@ -618,6 +649,15 @@ if __name__ == "__main__":
|
|||||||
help="Specify directory to save benchmark json results."
|
help="Specify directory to save benchmark json results."
|
||||||
"If not specified, results are saved in the current directory.",
|
"If not specified, results are saved in the current directory.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--result-filename",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Specify the filename to save benchmark json results."
|
||||||
|
"If not specified, results will be saved in "
|
||||||
|
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||||
|
" format.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ from tqdm import tqdm
|
|||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
PreTrainedTokenizerBase)
|
PreTrainedTokenizerBase)
|
||||||
|
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
def sample_requests(
|
||||||
@@ -78,8 +80,10 @@ def run_vllm(
|
|||||||
enable_prefix_caching: bool,
|
enable_prefix_caching: bool,
|
||||||
enable_chunked_prefill: bool,
|
enable_chunked_prefill: bool,
|
||||||
max_num_batched_tokens: int,
|
max_num_batched_tokens: int,
|
||||||
|
distributed_executor_backend: Optional[str],
|
||||||
gpu_memory_utilization: float = 0.9,
|
gpu_memory_utilization: float = 0.9,
|
||||||
download_dir: Optional[str] = None,
|
download_dir: Optional[str] = None,
|
||||||
|
load_format: str = EngineArgs.load_format,
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@@ -100,11 +104,13 @@ def run_vllm(
|
|||||||
download_dir=download_dir,
|
download_dir=download_dir,
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
max_num_batched_tokens=max_num_batched_tokens,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
load_format=load_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts = []
|
prompts: List[str] = []
|
||||||
sampling_params = []
|
sampling_params: List[SamplingParams] = []
|
||||||
for prompt, _, output_len in requests:
|
for prompt, _, output_len in requests:
|
||||||
prompts.append(prompt)
|
prompts.append(prompt)
|
||||||
sampling_params.append(
|
sampling_params.append(
|
||||||
@@ -225,8 +231,8 @@ def main(args: argparse.Namespace):
|
|||||||
args.enforce_eager, args.kv_cache_dtype,
|
args.enforce_eager, args.kv_cache_dtype,
|
||||||
args.quantization_param_path, args.device,
|
args.quantization_param_path, args.device,
|
||||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||||
args.max_num_batched_tokens, args.gpu_memory_utilization,
|
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||||
args.download_dir)
|
args.gpu_memory_utilization, args.download_dir, args.load_format)
|
||||||
elif args.backend == "hf":
|
elif args.backend == "hf":
|
||||||
assert args.tensor_parallel_size == 1
|
assert args.tensor_parallel_size == 1
|
||||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||||
@@ -256,7 +262,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
|
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||||
parser.add_argument("--backend",
|
parser.add_argument("--backend",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["vllm", "hf", "mii"],
|
choices=["vllm", "hf", "mii"],
|
||||||
@@ -343,9 +349,10 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--device",
|
"--device",
|
||||||
type=str,
|
type=str,
|
||||||
default="cuda",
|
default="auto",
|
||||||
choices=["cuda", "cpu"],
|
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
||||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
||||||
|
'CPU.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--enable-prefix-caching",
|
"--enable-prefix-caching",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@@ -368,6 +375,36 @@ if __name__ == "__main__":
|
|||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help='Path to save the throughput results in JSON format.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--distributed-executor-backend',
|
||||||
|
choices=['ray', 'mp'],
|
||||||
|
default=None,
|
||||||
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
|
'is used, will be automatically set to "ray" if installed '
|
||||||
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--load-format',
|
||||||
|
type=str,
|
||||||
|
default=EngineArgs.load_format,
|
||||||
|
choices=[
|
||||||
|
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||||
|
'bitsandbytes'
|
||||||
|
],
|
||||||
|
help='The format of the model weights to load.\n\n'
|
||||||
|
'* "auto" will try to load the weights in the safetensors format '
|
||||||
|
'and fall back to the pytorch bin format if safetensors format '
|
||||||
|
'is not available.\n'
|
||||||
|
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||||
|
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||||
|
'* "npcache" will load the weights in pytorch format and store '
|
||||||
|
'a numpy cache to speed up the loading.\n'
|
||||||
|
'* "dummy" will initialize the weights with random values, '
|
||||||
|
'which is mainly for profiling.\n'
|
||||||
|
'* "tensorizer" will load the weights using tensorizer from '
|
||||||
|
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||||
|
'section for more information.\n'
|
||||||
|
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||||
|
'quantization.\n')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
353
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
Normal file
353
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
|
import pickle as pkl
|
||||||
|
import time
|
||||||
|
from typing import Callable, Iterable, List, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.benchmark as TBenchmark
|
||||||
|
from torch.utils.benchmark import Measurement as TMeasurement
|
||||||
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
|
||||||
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
|
DEFAULT_TP_SIZES = [1]
|
||||||
|
|
||||||
|
# helpers
|
||||||
|
|
||||||
|
|
||||||
|
def to_fp8(tensor: torch.tensor) -> torch.tensor:
|
||||||
|
finfo = torch.finfo(torch.float8_e4m3fn)
|
||||||
|
return torch.round(tensor.clamp(
|
||||||
|
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
|
||||||
|
|
||||||
|
|
||||||
|
def to_int8(tensor: torch.tensor) -> torch.tensor:
|
||||||
|
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
|
||||||
|
|
||||||
|
|
||||||
|
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
|
||||||
|
k: int) -> Tuple[torch.tensor, torch.tensor]:
|
||||||
|
|
||||||
|
a = torch.randn((m, k), device='cuda') * 5
|
||||||
|
b = torch.randn((n, k), device='cuda').t() * 5
|
||||||
|
|
||||||
|
if dtype == torch.int8:
|
||||||
|
return to_int8(a), to_int8(b)
|
||||||
|
if dtype == torch.float8_e4m3fn:
|
||||||
|
return to_fp8(a), to_fp8(b)
|
||||||
|
|
||||||
|
raise ValueError("unsupported dtype")
|
||||||
|
|
||||||
|
|
||||||
|
# impl
|
||||||
|
|
||||||
|
|
||||||
|
def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||||
|
scale_b: torch.tensor,
|
||||||
|
out_dtype: torch.dtype) -> torch.tensor:
|
||||||
|
return torch.mm(a, b)
|
||||||
|
|
||||||
|
|
||||||
|
def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||||
|
scale_b: torch.tensor,
|
||||||
|
out_dtype: torch.dtype) -> torch.tensor:
|
||||||
|
return torch._scaled_mm(a,
|
||||||
|
b,
|
||||||
|
scale_a=scale_a,
|
||||||
|
scale_b=scale_b,
|
||||||
|
out_dtype=out_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
|
||||||
|
scale_a: torch.tensor, scale_b: torch.tensor,
|
||||||
|
out_dtype: torch.dtype) -> torch.tensor:
|
||||||
|
return torch._scaled_mm(a,
|
||||||
|
b,
|
||||||
|
scale_a=scale_a,
|
||||||
|
scale_b=scale_b,
|
||||||
|
out_dtype=out_dtype,
|
||||||
|
use_fast_accum=True)
|
||||||
|
|
||||||
|
|
||||||
|
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||||
|
scale_b: torch.tensor,
|
||||||
|
out_dtype: torch.dtype) -> torch.tensor:
|
||||||
|
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
# bench
|
||||||
|
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||||
|
scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
|
||||||
|
sub_label: str, fn: Callable, description: str) -> TMeasurement:
|
||||||
|
|
||||||
|
min_run_time = 1
|
||||||
|
|
||||||
|
globals = {
|
||||||
|
"a": a,
|
||||||
|
"b": b,
|
||||||
|
"scale_a": scale_a,
|
||||||
|
"scale_b": scale_b,
|
||||||
|
"out_dtype": out_dtype,
|
||||||
|
"fn": fn,
|
||||||
|
}
|
||||||
|
return TBenchmark.Timer(
|
||||||
|
stmt="fn(a, b, scale_a, scale_b, out_dtype)",
|
||||||
|
globals=globals,
|
||||||
|
label=label,
|
||||||
|
sub_label=sub_label,
|
||||||
|
description=description,
|
||||||
|
).blocked_autorange(min_run_time=min_run_time)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||||
|
sub_label: str) -> Iterable[TMeasurement]:
|
||||||
|
assert dtype == torch.int8
|
||||||
|
a, b = make_rand_tensors(torch.int8, m, n, k)
|
||||||
|
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
|
||||||
|
timers = []
|
||||||
|
# pytorch impl
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||||
|
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||||
|
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||||
|
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||||
|
|
||||||
|
# cutlass impl
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
|
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
|
||||||
|
|
||||||
|
return timers
|
||||||
|
|
||||||
|
|
||||||
|
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||||
|
sub_label: str) -> Iterable[TMeasurement]:
|
||||||
|
assert dtype == torch.float8_e4m3fn
|
||||||
|
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
|
||||||
|
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||||
|
|
||||||
|
timers = []
|
||||||
|
|
||||||
|
# pytorch impl w. bf16
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||||
|
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||||
|
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||||
|
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||||
|
|
||||||
|
# pytorch impl: bf16 output, without fp8 fast accum
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
|
pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
|
||||||
|
|
||||||
|
# pytorch impl: bf16 output, with fp8 fast accum
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
|
pytorch_fp8_impl_fast_accum,
|
||||||
|
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
|
||||||
|
|
||||||
|
# pytorch impl: fp16 output, without fp8 fast accum
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||||
|
pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
|
||||||
|
|
||||||
|
# pytorch impl: fp16 output, with fp8 fast accum
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||||
|
pytorch_fp8_impl_fast_accum,
|
||||||
|
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
|
||||||
|
|
||||||
|
# cutlass impl: bf16 output
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||||
|
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
|
||||||
|
# cutlass impl: fp16 output
|
||||||
|
timers.append(
|
||||||
|
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||||
|
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
|
||||||
|
return timers
|
||||||
|
|
||||||
|
|
||||||
|
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||||
|
sub_label: str) -> Iterable[TMeasurement]:
|
||||||
|
if dtype == torch.int8:
|
||||||
|
return bench_int8(dtype, m, k, n, label, sub_label)
|
||||||
|
if dtype == torch.float8_e4m3fn:
|
||||||
|
return bench_fp8(dtype, m, k, n, label, sub_label)
|
||||||
|
raise ValueError("unsupported type")
|
||||||
|
|
||||||
|
|
||||||
|
# runner
|
||||||
|
def print_timers(timers: Iterable[TMeasurement]):
|
||||||
|
compare = TBenchmark.Compare(timers)
|
||||||
|
compare.print()
|
||||||
|
|
||||||
|
|
||||||
|
def run(dtype: torch.dtype,
|
||||||
|
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for m, k, n in MKNs:
|
||||||
|
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
|
||||||
|
f"MKN=({m}x{k}x{n})")
|
||||||
|
print_timers(timers)
|
||||||
|
results.extend(timers)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
# output makers
|
||||||
|
def make_output(data: Iterable[TMeasurement],
|
||||||
|
MKNs: Iterable[Tuple[int, int, int]],
|
||||||
|
base_description: str,
|
||||||
|
timestamp=None):
|
||||||
|
|
||||||
|
print(f"== All Results {base_description} ====")
|
||||||
|
print_timers(data)
|
||||||
|
|
||||||
|
# pickle all the results
|
||||||
|
timestamp = int(time.time()) if timestamp is None else timestamp
|
||||||
|
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
|
||||||
|
pkl.dump(data, f)
|
||||||
|
|
||||||
|
|
||||||
|
# argparse runners
|
||||||
|
|
||||||
|
|
||||||
|
def run_square_bench(args):
|
||||||
|
dim_sizes = list(
|
||||||
|
range(args.dim_start, args.dim_end + 1, args.dim_increment))
|
||||||
|
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
|
||||||
|
data = run(args.dtype, MKNs)
|
||||||
|
|
||||||
|
make_output(data, MKNs, f"square_bench-{args.dtype}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_range_bench(args):
|
||||||
|
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
|
||||||
|
n = len(dim_sizes)
|
||||||
|
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
|
||||||
|
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
|
||||||
|
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
|
||||||
|
MKNs = list(zip(Ms, Ks, Ns))
|
||||||
|
data = run(args.dtype, MKNs)
|
||||||
|
|
||||||
|
make_output(data, MKNs, f"range_bench-{args.dtype}")
|
||||||
|
|
||||||
|
|
||||||
|
def run_model_bench(args):
|
||||||
|
|
||||||
|
print("Benchmarking models:")
|
||||||
|
for i, model in enumerate(args.models):
|
||||||
|
print(f"[{i}] {model}")
|
||||||
|
|
||||||
|
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
|
||||||
|
KNs = []
|
||||||
|
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
|
||||||
|
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||||
|
KNs.append(KN)
|
||||||
|
return KNs
|
||||||
|
|
||||||
|
model_bench_data = []
|
||||||
|
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||||
|
for model, tp_size in models_tps:
|
||||||
|
Ms = args.batch_sizes
|
||||||
|
KNs = model_shapes(model, tp_size)
|
||||||
|
MKNs = []
|
||||||
|
for m in Ms:
|
||||||
|
for k, n in KNs:
|
||||||
|
MKNs.append((m, k, n))
|
||||||
|
|
||||||
|
data = run(args.dtype, MKNs)
|
||||||
|
model_bench_data.append(data)
|
||||||
|
|
||||||
|
# Print all results
|
||||||
|
for data, model_tp in zip(model_bench_data, models_tps):
|
||||||
|
model, tp_size = model_tp
|
||||||
|
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
|
||||||
|
print_timers(data)
|
||||||
|
|
||||||
|
timestamp = int(time.time())
|
||||||
|
|
||||||
|
all_data = []
|
||||||
|
for d in model_bench_data:
|
||||||
|
all_data.extend(d)
|
||||||
|
# pickle all data
|
||||||
|
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
|
||||||
|
pkl.dump(all_data, f)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
def to_torch_dtype(dt):
|
||||||
|
if dt == "int8":
|
||||||
|
return torch.int8
|
||||||
|
if dt == "fp8":
|
||||||
|
return torch.float8_e4m3fn
|
||||||
|
raise ValueError("unsupported dtype")
|
||||||
|
|
||||||
|
parser = FlexibleArgumentParser(
|
||||||
|
description="""
|
||||||
|
Benchmark Cutlass GEMM.
|
||||||
|
|
||||||
|
To run square GEMMs:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
|
||||||
|
|
||||||
|
To run constant N and K and sweep M:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
|
||||||
|
|
||||||
|
To run dimensions from a model:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
|
||||||
|
""", # noqa: E501
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter)
|
||||||
|
|
||||||
|
parser.add_argument("--dtype",
|
||||||
|
type=to_torch_dtype,
|
||||||
|
required=True,
|
||||||
|
help="Available options are ['int8', 'fp8']")
|
||||||
|
subparsers = parser.add_subparsers(dest="cmd")
|
||||||
|
|
||||||
|
square_parser = subparsers.add_parser("square_bench")
|
||||||
|
square_parser.add_argument("--dim-start", type=int, required=True)
|
||||||
|
square_parser.add_argument("--dim-end", type=int, required=True)
|
||||||
|
square_parser.add_argument("--dim-increment", type=int, required=True)
|
||||||
|
square_parser.set_defaults(func=run_square_bench)
|
||||||
|
|
||||||
|
range_parser = subparsers.add_parser("range_bench")
|
||||||
|
range_parser.add_argument("--dim-start", type=int, required=True)
|
||||||
|
range_parser.add_argument("--dim-end", type=int, required=True)
|
||||||
|
range_parser.add_argument("--dim-increment", type=int, required=True)
|
||||||
|
range_parser.add_argument("--m-constant", type=int, default=None)
|
||||||
|
range_parser.add_argument("--n-constant", type=int, default=None)
|
||||||
|
range_parser.add_argument("--k-constant", type=int, default=None)
|
||||||
|
range_parser.set_defaults(func=run_range_bench)
|
||||||
|
|
||||||
|
model_parser = subparsers.add_parser("model_bench")
|
||||||
|
model_parser.add_argument("--models",
|
||||||
|
nargs="+",
|
||||||
|
type=str,
|
||||||
|
default=DEFAULT_MODELS,
|
||||||
|
choices=WEIGHT_SHAPES.keys())
|
||||||
|
model_parser.add_argument("--tp-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_TP_SIZES)
|
||||||
|
model_parser.add_argument("--batch-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_BATCH_SIZES)
|
||||||
|
model_parser.set_defaults(func=run_model_bench)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.func(args)
|
||||||
43
benchmarks/cutlass_benchmarks/weight_shapes.py
Normal file
43
benchmarks/cutlass_benchmarks/weight_shapes.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Weight Shapes are in the format
|
||||||
|
# ([K, N], TP_SPLIT_DIM)
|
||||||
|
# Example:
|
||||||
|
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
|
||||||
|
# - TP1 : K = 14336, N = 4096
|
||||||
|
# - TP2 : K = 7168, N = 4096
|
||||||
|
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
|
||||||
|
# - TP1 : K = 4096, N = 6144
|
||||||
|
# - TP4 : K = 4096, N = 1536
|
||||||
|
|
||||||
|
# TP1 shapes
|
||||||
|
WEIGHT_SHAPES = {
|
||||||
|
"mistralai/Mistral-7B-v0.1": [
|
||||||
|
([4096, 6144], 1),
|
||||||
|
([4096, 4096], 0),
|
||||||
|
([4096, 28672], 1),
|
||||||
|
([14336, 4096], 0),
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-7b-hf": [
|
||||||
|
([4096, 12288], 1),
|
||||||
|
([4096, 4096], 0),
|
||||||
|
([4096, 22016], 1),
|
||||||
|
([11008, 4096], 0),
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-3-8b": [
|
||||||
|
([4096, 6144], 1),
|
||||||
|
([4096, 4096], 0),
|
||||||
|
([4096, 28672], 1),
|
||||||
|
([14336, 4096], 0),
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-13b-hf": [
|
||||||
|
([5120, 15360], 1),
|
||||||
|
([5120, 5120], 0),
|
||||||
|
([5120, 27648], 1),
|
||||||
|
([13824, 5120], 0),
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-70b-hf": [
|
||||||
|
([8192, 10240], 1),
|
||||||
|
([8192, 8192], 0),
|
||||||
|
([8192, 57344], 1),
|
||||||
|
([28672, 8192], 0),
|
||||||
|
],
|
||||||
|
}
|
||||||
@@ -1,4 +1,3 @@
|
|||||||
import argparse
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops
|
|||||||
from vllm.model_executor.layers.quantization.aqlm import (
|
from vllm.model_executor.layers.quantization.aqlm import (
|
||||||
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
||||||
optimized_dequantize_gemm)
|
optimized_dequantize_gemm)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||||
|
|
||||||
@@ -86,9 +86,9 @@ def dequant_no_scale(
|
|||||||
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
||||||
# the generic pytorch version.
|
# the generic pytorch version.
|
||||||
# Just visual comparison.
|
# Just visual comparison.
|
||||||
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
|
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
|
||||||
|
|
||||||
n = parts.sum().item()
|
n = int(parts.sum().item())
|
||||||
|
|
||||||
device = torch.device('cuda:0')
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
|
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
|
||||||
|
|
||||||
# Add arguments
|
# Add arguments
|
||||||
parser.add_argument("--nbooks",
|
parser.add_argument("--nbooks",
|
||||||
@@ -204,7 +204,7 @@ def main():
|
|||||||
sys.stdout = sys.__stdout__
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
|
|
||||||
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
|
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
|
||||||
methods):
|
methods):
|
||||||
|
|
||||||
# I didn't see visible improvements from increasing these, but feel free :)
|
# I didn't see visible improvements from increasing these, but feel free :)
|
||||||
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
|
|||||||
print('')
|
print('')
|
||||||
|
|
||||||
|
|
||||||
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
|
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
|
||||||
nbooks: int, bits: int, method) -> float:
|
nbooks: int, bits: int, method) -> float:
|
||||||
|
|
||||||
n = parts.sum().item()
|
n = int(parts.sum().item())
|
||||||
|
|
||||||
device = torch.device('cuda:0')
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import argparse
|
from typing import List
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as benchmark
|
import torch.utils.benchmark as benchmark
|
||||||
@@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
|||||||
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
|
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
gptq_pack, quantize_weights, sort_weights)
|
gptq_pack, quantize_weights, sort_weights)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
@@ -23,8 +24,9 @@ ACT_ORDER_OPTS = [False, True]
|
|||||||
K_FULL_OPTS = [False, True]
|
K_FULL_OPTS = [False, True]
|
||||||
|
|
||||||
|
|
||||||
def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
|
def bench_run(results: List[benchmark.Measurement], model: str,
|
||||||
size_m, size_k, size_n):
|
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
|
||||||
|
size_m: int, size_k: int, size_n: int):
|
||||||
label = "Quant Matmul"
|
label = "Quant Matmul"
|
||||||
|
|
||||||
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
||||||
@@ -156,7 +158,7 @@ def main(args):
|
|||||||
for i, model in enumerate(args.models):
|
for i, model in enumerate(args.models):
|
||||||
print(f"[{i}] {model}")
|
print(f"[{i}] {model}")
|
||||||
|
|
||||||
results = []
|
results: List[benchmark.Measurement] = []
|
||||||
|
|
||||||
for model in args.models:
|
for model in args.models:
|
||||||
for layer in WEIGHT_SHAPES[model]:
|
for layer in WEIGHT_SHAPES[model]:
|
||||||
@@ -209,7 +211,7 @@ def main(args):
|
|||||||
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
||||||
#
|
#
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark Marlin across specified models/shapes/batches")
|
description="Benchmark Marlin across specified models/shapes/batches")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--models",
|
"--models",
|
||||||
|
|||||||
@@ -1,239 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn.functional as F
|
|
||||||
import triton
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe import (fused_moe,
|
|
||||||
get_config_file_name)
|
|
||||||
|
|
||||||
|
|
||||||
def main(model, tp_size, gpu, dtype: str):
|
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
|
|
||||||
method = fused_moe
|
|
||||||
for bs in [
|
|
||||||
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
|
||||||
2048, 3072, 4096
|
|
||||||
]:
|
|
||||||
run_grid(bs,
|
|
||||||
model=model,
|
|
||||||
method=method,
|
|
||||||
gpu=gpu,
|
|
||||||
tp_size=tp_size,
|
|
||||||
dtype=dtype)
|
|
||||||
|
|
||||||
|
|
||||||
def run_grid(bs, model, method, gpu, tp_size, dtype: str):
|
|
||||||
if model == '8x7B':
|
|
||||||
d_model = 4096
|
|
||||||
model_intermediate_size = 14336
|
|
||||||
num_layers = 32
|
|
||||||
elif model == '8x22B':
|
|
||||||
d_model = 6144
|
|
||||||
model_intermediate_size = 16384
|
|
||||||
num_layers = 56
|
|
||||||
else:
|
|
||||||
raise ValueError(f'Unsupported Mixtral model {model}')
|
|
||||||
num_total_experts = 8
|
|
||||||
top_k = 2
|
|
||||||
# tp_size = 2
|
|
||||||
num_calls = 100
|
|
||||||
|
|
||||||
num_warmup_trials = 1
|
|
||||||
num_trials = 1
|
|
||||||
|
|
||||||
configs = []
|
|
||||||
|
|
||||||
for block_size_n in [32, 64, 128, 256]:
|
|
||||||
for block_size_m in [16, 32, 64, 128, 256]:
|
|
||||||
for block_size_k in [64, 128, 256]:
|
|
||||||
for group_size_m in [1, 16, 32, 64]:
|
|
||||||
for num_warps in [4, 8]:
|
|
||||||
for num_stages in [2, 3, 4, 5]:
|
|
||||||
configs.append({
|
|
||||||
"BLOCK_SIZE_M": block_size_m,
|
|
||||||
"BLOCK_SIZE_N": block_size_n,
|
|
||||||
"BLOCK_SIZE_K": block_size_k,
|
|
||||||
"GROUP_SIZE_M": group_size_m,
|
|
||||||
"num_warps": num_warps,
|
|
||||||
"num_stages": num_stages,
|
|
||||||
})
|
|
||||||
|
|
||||||
best_config = None
|
|
||||||
best_time_us = 1e20
|
|
||||||
|
|
||||||
print(f'{tp_size=} {bs=}')
|
|
||||||
|
|
||||||
for config in tqdm(configs):
|
|
||||||
# warmup
|
|
||||||
try:
|
|
||||||
for _ in range(num_warmup_trials):
|
|
||||||
run_timing(
|
|
||||||
num_calls=num_calls,
|
|
||||||
bs=bs,
|
|
||||||
d_model=d_model,
|
|
||||||
num_total_experts=num_total_experts,
|
|
||||||
top_k=top_k,
|
|
||||||
tp_size=tp_size,
|
|
||||||
model_intermediate_size=model_intermediate_size,
|
|
||||||
method=method,
|
|
||||||
config=config,
|
|
||||||
dtype=dtype,
|
|
||||||
)
|
|
||||||
except triton.runtime.autotuner.OutOfResources:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# trial
|
|
||||||
for _ in range(num_trials):
|
|
||||||
kernel_dur_ms = run_timing(
|
|
||||||
num_calls=num_calls,
|
|
||||||
bs=bs,
|
|
||||||
d_model=d_model,
|
|
||||||
num_total_experts=num_total_experts,
|
|
||||||
top_k=top_k,
|
|
||||||
tp_size=tp_size,
|
|
||||||
model_intermediate_size=model_intermediate_size,
|
|
||||||
method=method,
|
|
||||||
config=config,
|
|
||||||
dtype=dtype,
|
|
||||||
)
|
|
||||||
|
|
||||||
kernel_dur_us = 1000 * kernel_dur_ms
|
|
||||||
model_dur_ms = kernel_dur_ms * num_layers
|
|
||||||
|
|
||||||
if kernel_dur_us < best_time_us:
|
|
||||||
best_config = config
|
|
||||||
best_time_us = kernel_dur_us
|
|
||||||
|
|
||||||
tqdm.write(
|
|
||||||
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
|
|
||||||
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
|
|
||||||
f'{d_model=} {model_intermediate_size=} {num_layers=}')
|
|
||||||
|
|
||||||
print("best_time_us", best_time_us)
|
|
||||||
print("best_config", best_config)
|
|
||||||
|
|
||||||
# holds Dict[str, Dict[str, int]]
|
|
||||||
filename = get_config_file_name(num_total_experts,
|
|
||||||
model_intermediate_size // tp_size,
|
|
||||||
"float8" if dtype == "float8" else None)
|
|
||||||
print(f"writing config to file {filename}")
|
|
||||||
existing_content = {}
|
|
||||||
if os.path.exists(filename):
|
|
||||||
with open(filename, "r") as f:
|
|
||||||
existing_content = json.load(f)
|
|
||||||
existing_content[str(bs)] = best_config
|
|
||||||
with open(filename, "w") as f:
|
|
||||||
json.dump(existing_content, f, indent=4)
|
|
||||||
f.write("\n")
|
|
||||||
|
|
||||||
|
|
||||||
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
|
|
||||||
top_k: int, tp_size: int, model_intermediate_size: int, method,
|
|
||||||
config, dtype: str) -> float:
|
|
||||||
shard_intermediate_size = model_intermediate_size // tp_size
|
|
||||||
|
|
||||||
hidden_states = torch.rand(
|
|
||||||
(bs, d_model),
|
|
||||||
device="cuda:0",
|
|
||||||
dtype=torch.float16,
|
|
||||||
)
|
|
||||||
|
|
||||||
w1 = torch.rand(
|
|
||||||
(num_total_experts, 2 * shard_intermediate_size, d_model),
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=hidden_states.dtype,
|
|
||||||
)
|
|
||||||
|
|
||||||
w2 = torch.rand(
|
|
||||||
(num_total_experts, d_model, shard_intermediate_size),
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=hidden_states.dtype,
|
|
||||||
)
|
|
||||||
|
|
||||||
w1_scale = None
|
|
||||||
w2_scale = None
|
|
||||||
a1_scale = None
|
|
||||||
a2_scale = None
|
|
||||||
|
|
||||||
if dtype == "float8":
|
|
||||||
w1 = w1.to(torch.float8_e4m3fn)
|
|
||||||
w2 = w2.to(torch.float8_e4m3fn)
|
|
||||||
w1_scale = torch.ones(num_total_experts,
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=torch.float32)
|
|
||||||
w2_scale = torch.ones(num_total_experts,
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=torch.float32)
|
|
||||||
a1_scale = torch.ones(1,
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=torch.float32)
|
|
||||||
a2_scale = torch.ones(1,
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=torch.float32)
|
|
||||||
|
|
||||||
gating_output = F.softmax(torch.rand(
|
|
||||||
(num_calls, bs, num_total_experts),
|
|
||||||
device=hidden_states.device,
|
|
||||||
dtype=torch.float32,
|
|
||||||
),
|
|
||||||
dim=-1)
|
|
||||||
|
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
|
|
||||||
start_event.record()
|
|
||||||
for i in range(num_calls):
|
|
||||||
hidden_states = method(
|
|
||||||
hidden_states=hidden_states,
|
|
||||||
w1=w1,
|
|
||||||
w2=w2,
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
gating_output=gating_output[i],
|
|
||||||
topk=2,
|
|
||||||
renormalize=True,
|
|
||||||
inplace=True,
|
|
||||||
override_config=config,
|
|
||||||
use_fp8=dtype == "float8",
|
|
||||||
)
|
|
||||||
end_event.record()
|
|
||||||
end_event.synchronize()
|
|
||||||
|
|
||||||
dur_ms = start_event.elapsed_time(end_event) / num_calls
|
|
||||||
return dur_ms
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog='benchmark_mixtral_moe',
|
|
||||||
description='Benchmark and tune the fused_moe kernel',
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--dtype',
|
|
||||||
type=str,
|
|
||||||
default='auto',
|
|
||||||
choices=['float8', 'float16'],
|
|
||||||
help='Data type used for fused_moe kernel computations',
|
|
||||||
)
|
|
||||||
parser.add_argument('--model',
|
|
||||||
type=str,
|
|
||||||
default='8x7B',
|
|
||||||
choices=['8x7B', '8x22B'],
|
|
||||||
help='The Mixtral model to benchmark')
|
|
||||||
parser.add_argument('--tp-size',
|
|
||||||
type=int,
|
|
||||||
default=2,
|
|
||||||
help='Tensor paralleli size')
|
|
||||||
parser.add_argument('--gpu',
|
|
||||||
type=int,
|
|
||||||
default=0,
|
|
||||||
help="GPU ID for benchmarking")
|
|
||||||
args = parser.parse_args()
|
|
||||||
sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))
|
|
||||||
333
benchmarks/kernels/benchmark_moe.py
Normal file
333
benchmarks/kernels/benchmark_moe.py
Normal file
@@ -0,0 +1,333 @@
|
|||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, List, Tuple, TypedDict
|
||||||
|
|
||||||
|
import ray
|
||||||
|
import torch
|
||||||
|
import triton
|
||||||
|
from ray.experimental.tqdm_ray import tqdm
|
||||||
|
from transformers import AutoConfig
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkConfig(TypedDict):
|
||||||
|
BLOCK_SIZE_M: int
|
||||||
|
BLOCK_SIZE_N: int
|
||||||
|
BLOCK_SIZE_K: int
|
||||||
|
GROUP_SIZE_M: int
|
||||||
|
num_warps: int
|
||||||
|
num_stages: int
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_config(
|
||||||
|
config: BenchmarkConfig,
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
num_iters: int = 100,
|
||||||
|
) -> float:
|
||||||
|
init_dtype = torch.float16 if use_fp8 else dtype
|
||||||
|
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||||
|
w1 = torch.randn(num_experts,
|
||||||
|
shard_intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
dtype=init_dtype)
|
||||||
|
w2 = torch.randn(num_experts,
|
||||||
|
hidden_size,
|
||||||
|
shard_intermediate_size // 2,
|
||||||
|
dtype=init_dtype)
|
||||||
|
gating_output = torch.randn(num_iters,
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
dtype=torch.float32)
|
||||||
|
|
||||||
|
w1_scale = None
|
||||||
|
w2_scale = None
|
||||||
|
a1_scale = None
|
||||||
|
a2_scale = None
|
||||||
|
if use_fp8:
|
||||||
|
w1_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||||
|
w2_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||||
|
a1_scale = torch.randn(1, dtype=torch.float32)
|
||||||
|
a2_scale = torch.randn(1, dtype=torch.float32)
|
||||||
|
|
||||||
|
w1 = w1.to(torch.float8_e4m3fn)
|
||||||
|
w2 = w2.to(torch.float8_e4m3fn)
|
||||||
|
|
||||||
|
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
||||||
|
|
||||||
|
def prepare(i: int):
|
||||||
|
input_gating.copy_(gating_output[i])
|
||||||
|
|
||||||
|
def run():
|
||||||
|
fused_moe(
|
||||||
|
x,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
input_gating,
|
||||||
|
topk,
|
||||||
|
renormalize=True,
|
||||||
|
inplace=True,
|
||||||
|
override_config=config,
|
||||||
|
use_fp8=use_fp8,
|
||||||
|
w1_scale=w1_scale,
|
||||||
|
w2_scale=w2_scale,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
)
|
||||||
|
|
||||||
|
# JIT compilation & warmup
|
||||||
|
run()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
# Capture 10 invocations with CUDA graph
|
||||||
|
graph = torch.cuda.CUDAGraph()
|
||||||
|
with torch.cuda.graph(graph):
|
||||||
|
for _ in range(10):
|
||||||
|
run()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
for _ in range(5):
|
||||||
|
graph.replay()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
latencies: List[float] = []
|
||||||
|
for i in range(num_iters):
|
||||||
|
prepare(i)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event.record()
|
||||||
|
graph.replay()
|
||||||
|
end_event.record()
|
||||||
|
end_event.synchronize()
|
||||||
|
latencies.append(start_event.elapsed_time(end_event))
|
||||||
|
avg = sum(latencies) / (num_iters * 10) * 1000 # us
|
||||||
|
graph.reset()
|
||||||
|
return avg
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs_compute_bound() -> List[Dict[str, int]]:
|
||||||
|
# Reduced search space for faster tuning.
|
||||||
|
# TODO(woosuk): Increase the search space and use a performance model to
|
||||||
|
# prune the search space.
|
||||||
|
configs: List[BenchmarkConfig] = []
|
||||||
|
for num_stages in [2, 3, 4, 5]:
|
||||||
|
for block_m in [16, 32, 64, 128, 256]:
|
||||||
|
for block_k in [64, 128, 256]:
|
||||||
|
for block_n in [32, 64, 128, 256]:
|
||||||
|
for num_warps in [4, 8]:
|
||||||
|
for group_size in [1, 16, 32, 64]:
|
||||||
|
configs.append({
|
||||||
|
"BLOCK_SIZE_M": block_m,
|
||||||
|
"BLOCK_SIZE_N": block_n,
|
||||||
|
"BLOCK_SIZE_K": block_k,
|
||||||
|
"GROUP_SIZE_M": group_size,
|
||||||
|
"num_warps": num_warps,
|
||||||
|
"num_stages": num_stages,
|
||||||
|
})
|
||||||
|
return configs
|
||||||
|
|
||||||
|
|
||||||
|
@ray.remote(num_gpus=1)
|
||||||
|
class BenchmarkWorker:
|
||||||
|
|
||||||
|
def __init__(self, seed: int) -> None:
|
||||||
|
torch.set_default_device("cuda")
|
||||||
|
torch.cuda.manual_seed_all(seed)
|
||||||
|
self.seed = seed
|
||||||
|
|
||||||
|
def benchmark(
|
||||||
|
self,
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
) -> Tuple[Dict[str, int], float]:
|
||||||
|
torch.cuda.manual_seed_all(self.seed)
|
||||||
|
|
||||||
|
dtype_str = "float8" if use_fp8 else None
|
||||||
|
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||||
|
# is the intermediate size after silu_and_mul.
|
||||||
|
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
|
||||||
|
dtype_str)
|
||||||
|
if op_config is None:
|
||||||
|
config = get_default_config(num_tokens, num_experts,
|
||||||
|
shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype_str)
|
||||||
|
else:
|
||||||
|
config = op_config[min(op_config.keys(),
|
||||||
|
key=lambda x: abs(x - num_tokens))]
|
||||||
|
kernel_time = benchmark_config(config, num_tokens, num_experts,
|
||||||
|
shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8)
|
||||||
|
return config, kernel_time
|
||||||
|
|
||||||
|
def tune(
|
||||||
|
self,
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
search_space: List[BenchmarkConfig],
|
||||||
|
) -> BenchmarkConfig:
|
||||||
|
best_config = None
|
||||||
|
best_time = float("inf")
|
||||||
|
for config in tqdm(search_space):
|
||||||
|
try:
|
||||||
|
kernel_time = benchmark_config(config,
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
shard_intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
topk,
|
||||||
|
dtype,
|
||||||
|
use_fp8,
|
||||||
|
num_iters=10)
|
||||||
|
except triton.runtime.autotuner.OutOfResources:
|
||||||
|
# Some configurations may be invalid and fail to compile.
|
||||||
|
continue
|
||||||
|
|
||||||
|
if kernel_time < best_time:
|
||||||
|
best_time = kernel_time
|
||||||
|
best_config = config
|
||||||
|
now = datetime.now()
|
||||||
|
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||||
|
assert best_config is not None
|
||||||
|
return best_config
|
||||||
|
|
||||||
|
|
||||||
|
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
|
||||||
|
return {
|
||||||
|
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
||||||
|
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
||||||
|
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
|
||||||
|
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
|
||||||
|
"num_warps": config["num_warps"],
|
||||||
|
"num_stages": config["num_stages"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def save_configs(
|
||||||
|
configs: Dict[int, BenchmarkConfig],
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
) -> None:
|
||||||
|
dtype_str = "float8" if use_fp8 else None
|
||||||
|
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||||
|
# is the intermediate size after silu_and_mul.
|
||||||
|
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
|
||||||
|
dtype_str)
|
||||||
|
print(f"Writing best config to {filename}...")
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
json.dump(configs, f, indent=4)
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: argparse.Namespace):
|
||||||
|
print(args)
|
||||||
|
|
||||||
|
config = AutoConfig.from_pretrained(args.model)
|
||||||
|
if config.architectures[0] == "DbrxForCausalLM":
|
||||||
|
E = config.ffn_config.moe_num_experts
|
||||||
|
topk = config.ffn_config.moe_top_k
|
||||||
|
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||||
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
|
else:
|
||||||
|
# Default: Mixtral.
|
||||||
|
E = config.num_local_experts
|
||||||
|
topk = config.num_experts_per_tok
|
||||||
|
intermediate_size = config.intermediate_size
|
||||||
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
|
|
||||||
|
hidden_size = config.hidden_size
|
||||||
|
dtype = config.torch_dtype
|
||||||
|
use_fp8 = args.dtype == "fp8"
|
||||||
|
|
||||||
|
if args.batch_size is None:
|
||||||
|
batch_sizes = [
|
||||||
|
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
||||||
|
2048, 3072, 4096
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
batch_sizes = [args.batch_size]
|
||||||
|
|
||||||
|
ray.init()
|
||||||
|
num_gpus = int(ray.available_resources()["GPU"])
|
||||||
|
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||||
|
|
||||||
|
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
|
||||||
|
outputs = []
|
||||||
|
worker_idx = 0
|
||||||
|
for input_args in inputs:
|
||||||
|
worker = workers[worker_idx]
|
||||||
|
worker_method = getattr(worker, method)
|
||||||
|
output = worker_method.remote(*input_args)
|
||||||
|
outputs.append(output)
|
||||||
|
worker_idx = (worker_idx + 1) % num_gpus
|
||||||
|
return ray.get(outputs)
|
||||||
|
|
||||||
|
if args.tune:
|
||||||
|
search_space = get_configs_compute_bound()
|
||||||
|
print(f"Start tuning over {len(search_space)} configurations...")
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
configs = _distribute(
|
||||||
|
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8, search_space)
|
||||||
|
for batch_size in batch_sizes])
|
||||||
|
best_configs = {
|
||||||
|
M: sort_config(config)
|
||||||
|
for M, config in zip(batch_sizes, configs)
|
||||||
|
}
|
||||||
|
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8)
|
||||||
|
end = time.time()
|
||||||
|
print(f"Tuning took {end - start:.2f} seconds")
|
||||||
|
else:
|
||||||
|
outputs = _distribute("benchmark",
|
||||||
|
[(batch_size, E, shard_intermediate_size,
|
||||||
|
hidden_size, topk, dtype, use_fp8)
|
||||||
|
for batch_size in batch_sizes])
|
||||||
|
|
||||||
|
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
|
||||||
|
print(f"Batch size: {batch_size}, config: {config}")
|
||||||
|
print(f"Kernel time: {kernel_time:.2f} us")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = FlexibleArgumentParser()
|
||||||
|
parser.add_argument("--model",
|
||||||
|
type=str,
|
||||||
|
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
||||||
|
parser.add_argument("--tp-size", "-tp", type=int, default=2)
|
||||||
|
parser.add_argument("--dtype",
|
||||||
|
type=str,
|
||||||
|
choices=["auto", "fp8"],
|
||||||
|
default="auto")
|
||||||
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
|
parser.add_argument("--batch-size", type=int, required=False)
|
||||||
|
parser.add_argument("--tune", action="store_true")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
main(args)
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
import argparse
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||||
|
create_kv_caches_with_random)
|
||||||
|
|
||||||
NUM_BLOCKS = 1024
|
NUM_BLOCKS = 1024
|
||||||
PARTITION_SIZE = 512
|
PARTITION_SIZE = 512
|
||||||
@@ -54,14 +54,17 @@ def main(
|
|||||||
|
|
||||||
# Create the block tables.
|
# Create the block tables.
|
||||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||||
block_tables = []
|
block_tables_lst: List[List[int]] = []
|
||||||
for _ in range(num_seqs):
|
for _ in range(num_seqs):
|
||||||
block_table = [
|
block_table = [
|
||||||
random.randint(0, NUM_BLOCKS - 1)
|
random.randint(0, NUM_BLOCKS - 1)
|
||||||
for _ in range(max_num_blocks_per_seq)
|
for _ in range(max_num_blocks_per_seq)
|
||||||
]
|
]
|
||||||
block_tables.append(block_table)
|
block_tables_lst.append(block_table)
|
||||||
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
|
|
||||||
|
block_tables = torch.tensor(block_tables_lst,
|
||||||
|
dtype=torch.int,
|
||||||
|
device=device)
|
||||||
|
|
||||||
# Create the KV cache.
|
# Create the KV cache.
|
||||||
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
||||||
@@ -158,14 +161,14 @@ def main(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the paged attention kernel.")
|
description="Benchmark the paged attention kernel.")
|
||||||
parser.add_argument("--version",
|
parser.add_argument("--version",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["v1", "v2"],
|
choices=["v1", "v2"],
|
||||||
default="v2")
|
default="v2")
|
||||||
parser.add_argument("--batch-size", type=int, default=8)
|
parser.add_argument("--batch-size", type=int, default=8)
|
||||||
parser.add_argument("--seq_len", type=int, default=4096)
|
parser.add_argument("--seq-len", type=int, default=4096)
|
||||||
parser.add_argument("--num-query-heads", type=int, default=64)
|
parser.add_argument("--num-query-heads", type=int, default=64)
|
||||||
parser.add_argument("--num-kv-heads", type=int, default=8)
|
parser.add_argument("--num-kv-heads", type=int, default=8)
|
||||||
parser.add_argument("--head-size",
|
parser.add_argument("--head-size",
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
import argparse
|
|
||||||
from itertools import accumulate
|
from itertools import accumulate
|
||||||
from typing import Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import nvtx
|
import nvtx
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
|
||||||
|
get_rope)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def benchmark_rope_kernels_multi_lora(
|
def benchmark_rope_kernels_multi_lora(
|
||||||
@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
})
|
})
|
||||||
# non-batched RoPE takes only one scaling factor, we create multiple
|
# non-batched RoPE takes only one scaling factor, we create multiple
|
||||||
# instances to simulate the same behavior
|
# instances to simulate the same behavior
|
||||||
non_batched_ropes = []
|
non_batched_ropes: List[RotaryEmbedding] = []
|
||||||
for scaling_factor in scaling_factors:
|
for scaling_factor in scaling_factors:
|
||||||
non_batched_ropes.append(
|
non_batched_ropes.append(
|
||||||
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
||||||
@@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the rotary embedding kernels.")
|
description="Benchmark the rotary embedding kernels.")
|
||||||
parser.add_argument("--is-neox-style", type=bool, default=True)
|
parser.add_argument("--is-neox-style", type=bool, default=True)
|
||||||
parser.add_argument("--batch-size", type=int, default=16)
|
parser.add_argument("--batch-size", type=int, default=16)
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
import argparse
|
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# A very long prompt, total number of tokens is about 15k.
|
# A very long prompt, total number of tokens is about 15k.
|
||||||
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
|
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
|
||||||
@@ -47,7 +47,7 @@ def main(args):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the performance of hashing function in'
|
description='Benchmark the performance of hashing function in'
|
||||||
'automatic prefix caching.')
|
'automatic prefix caching.')
|
||||||
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
|
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
|
|||||||
#
|
#
|
||||||
# Check the compile flags
|
# Check the compile flags
|
||||||
#
|
#
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-fopenmp"
|
"-fopenmp"
|
||||||
"-DVLLM_CPU_EXTENSION")
|
"-DVLLM_CPU_EXTENSION")
|
||||||
|
|
||||||
@@ -33,9 +33,23 @@ function (find_isa CPUINFO TARGET OUT)
|
|||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
function (is_avx512_disabled OUT)
|
||||||
|
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
||||||
|
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
||||||
|
set(${OUT} ON PARENT_SCOPE)
|
||||||
|
else()
|
||||||
|
set(${OUT} OFF PARENT_SCOPE)
|
||||||
|
endif()
|
||||||
|
endfunction()
|
||||||
|
|
||||||
if (AVX512_FOUND)
|
is_avx512_disabled(AVX512_DISABLED)
|
||||||
|
|
||||||
|
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||||
|
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||||
|
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||||
|
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||||
|
|
||||||
|
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-mavx512f"
|
"-mavx512f"
|
||||||
"-mavx512vl"
|
"-mavx512vl"
|
||||||
@@ -44,8 +58,8 @@ if (AVX512_FOUND)
|
|||||||
|
|
||||||
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
|
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
|
||||||
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
|
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
|
||||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||||
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
|
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
|
||||||
else()
|
else()
|
||||||
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
|
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
|
||||||
@@ -53,8 +67,18 @@ if (AVX512_FOUND)
|
|||||||
else()
|
else()
|
||||||
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
||||||
endif()
|
endif()
|
||||||
|
elseif (AVX2_FOUND)
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||||
|
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||||
|
elseif (POWER9_FOUND OR POWER10_FOUND)
|
||||||
|
message(STATUS "PowerPC detected")
|
||||||
|
# Check for PowerPC VSX support
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
|
"-mvsx"
|
||||||
|
"-mcpu=native"
|
||||||
|
"-mtune=native")
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
|
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||||
@@ -73,7 +97,7 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/cpu/cache.cpp"
|
"csrc/cpu/cache.cpp"
|
||||||
"csrc/cpu/layernorm.cpp"
|
"csrc/cpu/layernorm.cpp"
|
||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/cpu/pybind.cpp")
|
"csrc/cpu/torch_bindings.cpp")
|
||||||
|
|
||||||
define_gpu_extension_target(
|
define_gpu_extension_target(
|
||||||
_C
|
_C
|
||||||
@@ -81,10 +105,10 @@ define_gpu_extension_target(
|
|||||||
LANGUAGE CXX
|
LANGUAGE CXX
|
||||||
SOURCES ${VLLM_EXT_SRC}
|
SOURCES ${VLLM_EXT_SRC}
|
||||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
||||||
WITH_SOABI
|
USE_SABI 3
|
||||||
|
WITH_SOABI
|
||||||
)
|
)
|
||||||
|
|
||||||
add_custom_target(default)
|
add_custom_target(default)
|
||||||
message(STATUS "Enabling C extension.")
|
message(STATUS "Enabling C extension.")
|
||||||
add_dependencies(default _C)
|
add_dependencies(default _C)
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
|
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
|
||||||
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
|
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
|
||||||
set(Python_EXECUTABLE ${EXECUTABLE})
|
set(Python_EXECUTABLE ${EXECUTABLE})
|
||||||
find_package(Python COMPONENTS Interpreter Development.Module)
|
find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
|
||||||
if (NOT Python_FOUND)
|
if (NOT Python_FOUND)
|
||||||
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
|
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
|
||||||
endif()
|
endif()
|
||||||
@@ -147,16 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
|||||||
if (${GPU_LANG} STREQUAL "HIP")
|
if (${GPU_LANG} STREQUAL "HIP")
|
||||||
#
|
#
|
||||||
# `GPU_ARCHES` controls the `--offload-arch` flags.
|
# `GPU_ARCHES` controls the `--offload-arch` flags.
|
||||||
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
|
|
||||||
# via the `PYTORCH_ROCM_ARCH` env variable.
|
|
||||||
#
|
#
|
||||||
|
# If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
|
||||||
|
# if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
|
||||||
|
# "rocm_agent_enumerator" in "enable_language(HIP)"
|
||||||
|
# (in file Modules/CMakeDetermineHIPCompiler.cmake)
|
||||||
|
#
|
||||||
|
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
|
||||||
|
set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
|
||||||
|
else()
|
||||||
|
set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
|
||||||
|
endif()
|
||||||
#
|
#
|
||||||
# Find the intersection of the supported + detected architectures to
|
# Find the intersection of the supported + detected architectures to
|
||||||
# set the module architecture flags.
|
# set the module architecture flags.
|
||||||
#
|
#
|
||||||
set(${GPU_ARCHES})
|
set(${GPU_ARCHES})
|
||||||
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
|
foreach (_ARCH ${HIP_ARCHITECTURES})
|
||||||
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
|
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
|
||||||
list(APPEND ${GPU_ARCHES} ${_ARCH})
|
list(APPEND ${GPU_ARCHES} ${_ARCH})
|
||||||
endif()
|
endif()
|
||||||
@@ -164,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
|||||||
|
|
||||||
if(NOT ${GPU_ARCHES})
|
if(NOT ${GPU_ARCHES})
|
||||||
message(FATAL_ERROR
|
message(FATAL_ERROR
|
||||||
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
|
"None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
|
||||||
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
|
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@@ -294,6 +301,7 @@ endmacro()
|
|||||||
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
|
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
|
||||||
# LIBRARIES <libraries> - Extra link libraries.
|
# LIBRARIES <libraries> - Extra link libraries.
|
||||||
# WITH_SOABI - Generate library with python SOABI suffix name.
|
# WITH_SOABI - Generate library with python SOABI suffix name.
|
||||||
|
# USE_SABI <version> - Use python stable api <version>
|
||||||
#
|
#
|
||||||
# Note: optimization level/debug info is set via cmake build type.
|
# Note: optimization level/debug info is set via cmake build type.
|
||||||
#
|
#
|
||||||
@@ -301,7 +309,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
|||||||
cmake_parse_arguments(PARSE_ARGV 1
|
cmake_parse_arguments(PARSE_ARGV 1
|
||||||
GPU
|
GPU
|
||||||
"WITH_SOABI"
|
"WITH_SOABI"
|
||||||
"DESTINATION;LANGUAGE"
|
"DESTINATION;LANGUAGE;USE_SABI"
|
||||||
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
|
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
|
||||||
|
|
||||||
# Add hipify preprocessing step when building with HIP/ROCm.
|
# Add hipify preprocessing step when building with HIP/ROCm.
|
||||||
@@ -315,7 +323,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
|||||||
set(GPU_WITH_SOABI)
|
set(GPU_WITH_SOABI)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
|
if (GPU_USE_SABI)
|
||||||
|
Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||||
|
else()
|
||||||
|
Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (GPU_LANGUAGE STREQUAL "HIP")
|
if (GPU_LANGUAGE STREQUAL "HIP")
|
||||||
# Make this target dependent on the hipify preprocessor step.
|
# Make this target dependent on the hipify preprocessor step.
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ DEFAULT_CONDA_PATTERNS = {
|
|||||||
"triton",
|
"triton",
|
||||||
"optree",
|
"optree",
|
||||||
"nccl",
|
"nccl",
|
||||||
|
"transformers",
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_PIP_PATTERNS = {
|
DEFAULT_PIP_PATTERNS = {
|
||||||
@@ -75,6 +76,7 @@ DEFAULT_PIP_PATTERNS = {
|
|||||||
"optree",
|
"optree",
|
||||||
"onnx",
|
"onnx",
|
||||||
"nccl",
|
"nccl",
|
||||||
|
"transformers",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -601,6 +603,11 @@ Versions of relevant libraries:
|
|||||||
{conda_packages}
|
{conda_packages}
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
|
# both the above code and the following code use `strip()` to
|
||||||
|
# remove leading/trailing whitespaces, so we need to add a newline
|
||||||
|
# in between to separate the two sections
|
||||||
|
env_info_fmt += "\n"
|
||||||
|
|
||||||
env_info_fmt += """
|
env_info_fmt += """
|
||||||
ROCM Version: {rocm_version}
|
ROCM Version: {rocm_version}
|
||||||
Neuron SDK Version: {neuron_sdk_version}
|
Neuron SDK Version: {neuron_sdk_version}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
|||||||
return ((T)0.5) * x * (((T)1.0) + t);
|
return ((T)0.5) * x * (((T)1.0) + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
|
||||||
|
// x * sigmoid(1.702 * x)
|
||||||
|
return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void gelu_new(torch::Tensor& out, // [..., d]
|
void gelu_new(torch::Tensor& out, // [..., d]
|
||||||
@@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d]
|
|||||||
{
|
{
|
||||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, // [..., d]
|
||||||
|
torch::Tensor& input) // [..., d]
|
||||||
|
{
|
||||||
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
|
||||||
|
}
|
||||||
|
|||||||
@@ -17,7 +17,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@@ -808,16 +808,17 @@ void paged_attention_v1(
|
|||||||
torch::Tensor&
|
torch::Tensor&
|
||||||
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||||
torch::Tensor&
|
torch::Tensor&
|
||||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||||
int num_kv_heads, // [num_heads]
|
int64_t num_kv_heads, // [num_heads]
|
||||||
float scale,
|
double scale,
|
||||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||||
torch::Tensor& seq_lens, // [num_seqs]
|
torch::Tensor& seq_lens, // [num_seqs]
|
||||||
int block_size, int max_seq_len,
|
int64_t block_size, int64_t max_seq_len,
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||||
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
||||||
@@ -972,16 +973,17 @@ void paged_attention_v2(
|
|||||||
torch::Tensor&
|
torch::Tensor&
|
||||||
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||||
torch::Tensor&
|
torch::Tensor&
|
||||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||||
int num_kv_heads, // [num_heads]
|
int64_t num_kv_heads, // [num_heads]
|
||||||
float scale,
|
double scale,
|
||||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||||
torch::Tensor& seq_lens, // [num_seqs]
|
torch::Tensor& seq_lens, // [num_seqs]
|
||||||
int block_size, int max_seq_len,
|
int64_t block_size, int64_t max_seq_len,
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
||||||
CALL_V2_LAUNCHER_BLOCK_SIZE)
|
CALL_V2_LAUNCHER_BLOCK_SIZE)
|
||||||
@@ -990,4 +992,4 @@ void paged_attention_v2(
|
|||||||
#undef WARP_SIZE
|
#undef WARP_SIZE
|
||||||
#undef MAX
|
#undef MAX
|
||||||
#undef MIN
|
#undef MIN
|
||||||
#undef DIVIDE_ROUND_UP
|
#undef DIVIDE_ROUND_UP
|
||||||
|
|||||||
14
csrc/cache.h
14
csrc/cache.h
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -8,14 +8,18 @@
|
|||||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||||
const torch::Tensor& block_mapping);
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
void copy_blocks(std::vector<torch::Tensor>& key_caches,
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
std::vector<torch::Tensor>& value_caches,
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
const torch::Tensor& block_mapping);
|
const torch::Tensor& block_mapping);
|
||||||
|
|
||||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||||
torch::Tensor& slot_mapping,
|
torch::Tensor& slot_mapping,
|
||||||
const std::string& kv_cache_dtype, const float kv_scale);
|
const std::string& kv_cache_dtype,
|
||||||
|
const double kv_scale);
|
||||||
|
|
||||||
void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
|
void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
|
||||||
torch::Tensor& key_cache,
|
torch::Tensor& key_cache,
|
||||||
@@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
|
|||||||
|
|
||||||
// Just for unittest
|
// Just for unittest
|
||||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||||
const float scale, const std::string& kv_cache_dtype);
|
const double scale, const std::string& kv_cache_dtype);
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
@@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
|
|||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void copy_blocks(std::vector<torch::Tensor>& key_caches,
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
std::vector<torch::Tensor>& value_caches,
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
const torch::Tensor& block_mapping) {
|
const torch::Tensor& block_mapping) {
|
||||||
int num_layers = key_caches.size();
|
int num_layers = key_caches.size();
|
||||||
TORCH_CHECK(num_layers == value_caches.size());
|
TORCH_CHECK(num_layers == value_caches.size());
|
||||||
@@ -255,7 +258,7 @@ void reshape_and_cache(
|
|||||||
torch::Tensor&
|
torch::Tensor&
|
||||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||||
torch::Tensor& slot_mapping, // [num_tokens]
|
torch::Tensor& slot_mapping, // [num_tokens]
|
||||||
const std::string& kv_cache_dtype, const float kv_scale) {
|
const std::string& kv_cache_dtype, const double kv_scale) {
|
||||||
int num_tokens = key.size(0);
|
int num_tokens = key.size(0);
|
||||||
int num_heads = key.size(1);
|
int num_heads = key.size(1);
|
||||||
int head_size = key.size(2);
|
int head_size = key.size(2);
|
||||||
@@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
|
|||||||
|
|
||||||
// Only for testing.
|
// Only for testing.
|
||||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||||
const float kv_scale, const std::string& kv_cache_dtype) {
|
const double kv_scale, const std::string& kv_cache_dtype) {
|
||||||
torch::Device src_device = src_cache.device();
|
torch::Device src_device = src_cache.device();
|
||||||
torch::Device dst_device = dst_cache.device();
|
torch::Device dst_device = dst_cache.device();
|
||||||
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
|
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
|
||||||
|
|||||||
@@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
|
|||||||
return w3 * x * (ones + t);
|
return w3 * x * (ones + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) {
|
||||||
|
const vec_op::FP32Vec8 zeros(0.0);
|
||||||
|
const vec_op::FP32Vec8 ones(1.0);
|
||||||
|
const vec_op::FP32Vec8 w1(1.702f);
|
||||||
|
return x / (ones + (zeros - w1 * x).exp());
|
||||||
|
}
|
||||||
|
|
||||||
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
|
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
|
||||||
const vec_op::FP32Vec8 ones(1.0);
|
const vec_op::FP32Vec8 ones(1.0);
|
||||||
const vec_op::FP32Vec8 w1(M_SQRT1_2);
|
const vec_op::FP32Vec8 w1(M_SQRT1_2);
|
||||||
@@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
|
|||||||
CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
|
CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
|
||||||
|
int num_tokens = input.numel() / input.size(-1);
|
||||||
|
int d = input.size(-1);
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(gelu_quick_impl)
|
||||||
|
activation_kernel<scalar_t, gelu_quick_act, false>(
|
||||||
|
num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
|
||||||
|
CPU_KERNEL_GUARD_OUT(gelu_quick_impl)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
@@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher(
|
|||||||
|
|
||||||
void paged_attention_v1(
|
void paged_attention_v1(
|
||||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
TORCH_CHECK(kv_scale == 1.0f);
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||||
"CPU backend does not support blocksparse attention yet.");
|
"CPU backend does not support blocksparse attention yet.");
|
||||||
@@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher(
|
|||||||
void paged_attention_v2(
|
void paged_attention_v2(
|
||||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
TORCH_CHECK(kv_scale == 1.0f);
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||||
"CPU backend does not support blocksparse attention yet.");
|
"CPU backend does not support blocksparse attention yet.");
|
||||||
|
|||||||
@@ -5,8 +5,8 @@
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
|
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
|
||||||
std::vector<torch::Tensor>& value_caches,
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
const torch::Tensor& mapping_pairs,
|
const torch::Tensor& mapping_pairs,
|
||||||
const int element_num_per_block,
|
const int element_num_per_block,
|
||||||
const int layer_num) {
|
const int layer_num) {
|
||||||
@@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl(
|
|||||||
}
|
}
|
||||||
}; // namespace
|
}; // namespace
|
||||||
|
|
||||||
void copy_blocks(std::vector<torch::Tensor>& key_caches,
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
std::vector<torch::Tensor>& value_caches,
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
const torch::Tensor& block_mapping) {
|
const torch::Tensor& block_mapping) {
|
||||||
unsigned num_layers = key_caches.size();
|
unsigned num_layers = key_caches.size();
|
||||||
TORCH_CHECK(num_layers == value_caches.size());
|
TORCH_CHECK(num_layers == value_caches.size());
|
||||||
@@ -104,7 +107,7 @@ void copy_blocks(std::vector<torch::Tensor>& key_caches,
|
|||||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||||
torch::Tensor& slot_mapping,
|
torch::Tensor& slot_mapping,
|
||||||
const std::string& kv_cache_dtype, float kv_scale) {
|
const std::string& kv_cache_dtype, double kv_scale) {
|
||||||
TORCH_CHECK(kv_scale == 1.0f);
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
|
|
||||||
int num_tokens = key.size(0);
|
int num_tokens = key.size(0);
|
||||||
|
|||||||
@@ -2,351 +2,14 @@
|
|||||||
#ifndef CPU_TYPES_HPP
|
#ifndef CPU_TYPES_HPP
|
||||||
#define CPU_TYPES_HPP
|
#define CPU_TYPES_HPP
|
||||||
|
|
||||||
#include <immintrin.h>
|
#if defined(__x86_64__)
|
||||||
#include <torch/extension.h>
|
//x86 implementation
|
||||||
|
#include "cpu_types_x86.hpp"
|
||||||
namespace vec_op {
|
#elif defined(__POWER9_VECTOR__)
|
||||||
|
//ppc implementation
|
||||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
#include "cpu_types_vsx.hpp"
|
||||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
|
||||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
|
||||||
|
|
||||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
|
||||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
|
||||||
|
|
||||||
#ifndef CPU_OP_GUARD
|
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
|
||||||
#else
|
#else
|
||||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
#warning "unsupported vLLM cpu implementation"
|
||||||
std::cout << #NAME << " invoked." << std::endl;
|
|
||||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
template <typename T, T... indexes, typename F>
|
|
||||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
|
||||||
(f(std::integral_constant<T, indexes>{}), ...);
|
|
||||||
}
|
|
||||||
}; // namespace
|
|
||||||
|
|
||||||
template <typename T, T count, typename F,
|
|
||||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
|
||||||
constexpr void unroll_loop(F &&f) {
|
|
||||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T> struct Vec {
|
|
||||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct FP32Vec8;
|
|
||||||
struct FP32Vec16;
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
|
||||||
|
|
||||||
__m128h reg;
|
|
||||||
|
|
||||||
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
|
||||||
|
|
||||||
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
|
||||||
|
|
||||||
explicit FP16Vec8(__m128h data) : reg(data) {}
|
|
||||||
|
|
||||||
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
|
||||||
|
|
||||||
__m128i reg;
|
|
||||||
|
|
||||||
explicit BF16Vec8(const void *ptr)
|
|
||||||
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
|
||||||
|
|
||||||
explicit BF16Vec8(const FP32Vec8 &);
|
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 16;
|
|
||||||
|
|
||||||
__m256i reg;
|
|
||||||
|
|
||||||
explicit BF16Vec16(const void *ptr)
|
|
||||||
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
|
||||||
|
|
||||||
explicit BF16Vec16(const FP32Vec16 &);
|
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 32;
|
|
||||||
|
|
||||||
__m512i reg;
|
|
||||||
|
|
||||||
explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
|
||||||
|
|
||||||
explicit BF16Vec32(__m512i data) : reg(data) {}
|
|
||||||
|
|
||||||
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
|
||||||
: reg((__m512i)_mm512_inserti32x4(
|
|
||||||
_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
|
|
||||||
(__m128i)vec8_data.reg),
|
|
||||||
(__m128i)vec8_data.reg, 1),
|
|
||||||
(__m128i)vec8_data.reg, 2),
|
|
||||||
(__m128i)vec8_data.reg, 3)) {}
|
|
||||||
|
|
||||||
void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 4;
|
|
||||||
union AliasReg {
|
|
||||||
__m128 reg;
|
|
||||||
float values[VEC_ELEM_NUM];
|
|
||||||
};
|
|
||||||
|
|
||||||
__m128 reg;
|
|
||||||
|
|
||||||
explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec4(__m128 data) : reg(data) {}
|
|
||||||
|
|
||||||
explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
|
||||||
union AliasReg {
|
|
||||||
__m256 reg;
|
|
||||||
float values[VEC_ELEM_NUM];
|
|
||||||
};
|
|
||||||
|
|
||||||
__m256 reg;
|
|
||||||
|
|
||||||
explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec8(__m256 data) : reg(data) {}
|
|
||||||
|
|
||||||
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
explicit FP32Vec8(const BF16Vec8 &v)
|
|
||||||
: reg(_mm256_castsi256_ps(
|
|
||||||
_mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
|
|
||||||
|
|
||||||
float reduce_sum() const {
|
|
||||||
AliasReg ar;
|
|
||||||
ar.reg = reg;
|
|
||||||
float result = 0;
|
|
||||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 exp() const {
|
|
||||||
AliasReg ar;
|
|
||||||
ar.reg = reg;
|
|
||||||
return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
|
|
||||||
expf(ar.values[5]), expf(ar.values[4]),
|
|
||||||
expf(ar.values[3]), expf(ar.values[2]),
|
|
||||||
expf(ar.values[1]), expf(ar.values[0])));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 tanh() const {
|
|
||||||
AliasReg ar;
|
|
||||||
ar.reg = reg;
|
|
||||||
return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
|
|
||||||
tanhf(ar.values[5]), tanhf(ar.values[4]),
|
|
||||||
tanhf(ar.values[3]), tanhf(ar.values[2]),
|
|
||||||
tanhf(ar.values[1]), tanhf(ar.values[0])));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 er() const {
|
|
||||||
AliasReg ar;
|
|
||||||
ar.reg = reg;
|
|
||||||
return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
|
|
||||||
erf(ar.values[5]), erf(ar.values[4]),
|
|
||||||
erf(ar.values[3]), erf(ar.values[2]),
|
|
||||||
erf(ar.values[1]), erf(ar.values[0])));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 operator*(const FP32Vec8 &b) const {
|
|
||||||
return FP32Vec8(_mm256_mul_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 operator+(const FP32Vec8 &b) const {
|
|
||||||
return FP32Vec8(_mm256_add_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 operator-(const FP32Vec8 &b) const {
|
|
||||||
return FP32Vec8(_mm256_sub_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec8 operator/(const FP32Vec8 &b) const {
|
|
||||||
return FP32Vec8(_mm256_div_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
|
||||||
constexpr static int VEC_ELEM_NUM = 16;
|
|
||||||
union AliasReg {
|
|
||||||
__m512 reg;
|
|
||||||
float values[VEC_ELEM_NUM];
|
|
||||||
};
|
|
||||||
|
|
||||||
__m512 reg;
|
|
||||||
|
|
||||||
explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(__m512 data) : reg(data) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const FP32Vec4 &data)
|
|
||||||
: reg((__m512)_mm512_inserti32x4(
|
|
||||||
_mm512_inserti32x4(
|
|
||||||
_mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
|
|
||||||
(__m128i)data.reg, 1),
|
|
||||||
(__m128i)data.reg, 2),
|
|
||||||
(__m128i)data.reg, 3)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const FP32Vec8 &data)
|
|
||||||
: reg((__m512)_mm512_inserti32x8(
|
|
||||||
_mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const BF16Vec16 &v)
|
|
||||||
: reg(_mm512_castsi512_ps(
|
|
||||||
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
|
|
||||||
|
|
||||||
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
|
||||||
|
|
||||||
FP32Vec16 operator*(const FP32Vec16 &b) const {
|
|
||||||
return FP32Vec16(_mm512_mul_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec16 operator+(const FP32Vec16 &b) const {
|
|
||||||
return FP32Vec16(_mm512_add_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec16 operator-(const FP32Vec16 &b) const {
|
|
||||||
return FP32Vec16(_mm512_sub_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
FP32Vec16 operator/(const FP32Vec16 &b) const {
|
|
||||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
|
||||||
}
|
|
||||||
|
|
||||||
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
|
||||||
|
|
||||||
template <int group_size> float reduce_sub_sum(int idx) {
|
|
||||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
|
||||||
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
|
||||||
__mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
|
|
||||||
return _mm512_mask_reduce_add_ps(mask, reg);
|
|
||||||
}
|
|
||||||
|
|
||||||
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename T> struct VecType { using vec_type = void; };
|
|
||||||
|
|
||||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
|
||||||
|
|
||||||
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
|
||||||
|
|
||||||
// Generic scalar store: assigns `v` to `*ptr`, relying on T's conversion or
// assignment from float.
template <typename T>
void storeFP32(float v, T *ptr) {
  T &dst = *ptr;
  dst = v;
}
|
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
// Half-precision store: writes `v` through a _Float16 view of `*ptr`, so the
// float -> half conversion is the compiler's _Float16 conversion.
template <>
inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  _Float16 *dst = reinterpret_cast<_Float16 *>(ptr);
  *dst = v;
}
#endif
|
|
||||||
|
|
||||||
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
|
||||||
acc = acc + a * b;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __AVX512BF16__
|
|
||||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
|
||||||
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
|
||||||
: reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
|
|
||||||
|
|
||||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
|
||||||
: reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
|
|
||||||
|
|
||||||
inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
|
|
||||||
acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
|
||||||
c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
|
|
||||||
reinterpret_cast<c10::BFloat16 *>(&v);
|
|
||||||
*ptr = *(v_ptr + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
|
||||||
: reg(_mm256_cvtepi32_epi16(
|
|
||||||
_mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
|
|
||||||
|
|
||||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
|
||||||
: reg(_mm512_cvtepi32_epi16(
|
|
||||||
_mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Issues _mm_prefetch on `addr` with the _MM_HINT_T1 locality hint.
inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
|
|
||||||
|
|
||||||
}; // namespace vec_op
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
491
csrc/cpu/cpu_types_vsx.hpp
Normal file
491
csrc/cpu/cpu_types_vsx.hpp
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
|
||||||
|
#ifndef CPU_TYPES_VSX_HPP
|
||||||
|
#define CPU_TYPES_VSX_HPP
|
||||||
|
|
||||||
|
#include <altivec.h>
|
||||||
|
#include <cmath>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
// Dispatch cases for the floating-point element types handled here:
// Float and BFloat16.
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

// Switches on TYPE and runs the body for each case listed above.
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

// Kernel entry/exit tracing. With CPU_OP_GUARD defined, each guard prints the
// kernel name to std::cout; otherwise both expand to nothing.
// The tracing variants already end in ';', so call sites add none.
#ifdef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME) \
    std::cout << #NAME << " invoked." << std::endl;
  #define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
#else
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
#endif

// Requests inlining via the always_inline attribute.
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
// Calls f(std::integral_constant<T, i>{}) once per index of the sequence, in
// ascending order, via a comma fold.
//
// This helper used to live in an anonymous namespace. In a header that gives
// it internal linkage while unroll_loop (external linkage) calls it, so every
// translation unit would see a different unroll_loop body. As a plain
// function template it has one definition across translation units.
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
  (f(std::integral_constant<T, indexes>{}), ...);
}

// Compile-time unrolled loop: invokes f(0), f(1), ..., f(count - 1).
//
// T:     index type.
// count: number of iterations; 0 invokes nothing.
// f:     callable that accepts a T.
template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F &&f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
|
||||||
|
|
||||||
|
// CRTP base for the vector wrappers: exposes the derived type's lane count.
template <typename T>
struct Vec {
  // Returns T::VEC_ELEM_NUM.
  static constexpr int get_elem_num() {
    return T::VEC_ELEM_NUM;
  }
};
|
||||||
|
|
||||||
|
typedef struct ss16x8x2_t {
|
||||||
|
__vector signed short val[2];
|
||||||
|
} ss16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct ss16x8x4_t {
|
||||||
|
__vector signed short val[4];
|
||||||
|
} ss16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct f32x4x2_t {
|
||||||
|
__vector float val[2];
|
||||||
|
} f32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct f32x4x4_t {
|
||||||
|
__vector float val[4];
|
||||||
|
} f32x4x4_t;
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
// Eight BF16 values held as raw 16-bit lanes in one VSX register.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __vector signed short reg;

  // Loads eight 16-bit values from `ptr` with vec_xl.
  explicit BF16Vec8(const void *ptr)
      : reg(vec_xl(0, reinterpret_cast<const signed short *>(ptr))) {}

  // FP32 -> BF16 conversion; defined after FP32Vec8 is complete.
  explicit BF16Vec8(const FP32Vec8 &);

  // Stores the eight 16-bit values to `ptr` with vec_xst, mirroring the
  // vec_xl load. The previous store went through a
  // `__vector signed short *` dereference, which assumes `ptr` is aligned
  // for the vector type.
  void save(void *ptr) const {
    vec_xst(reg, 0, reinterpret_cast<signed short *>(ptr));
  }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 values held as raw 16-bit lanes in two VSX registers.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  ss16x8x2_t reg;

  // Loads 32 bytes from `ptr` as two 16-byte vectors.
  explicit BF16Vec16(const void *ptr) {
    signed short *src = (signed short *)ptr;
    reg.val[0] = (__vector signed short)vec_xl(0, src);
    reg.val[1] = (__vector signed short)vec_xl(16, src);
  }

  // FP32 -> BF16 conversion; defined after FP32Vec16 is complete.
  explicit BF16Vec16(const FP32Vec16 &);

  // Stores 32 bytes to `ptr` as two 16-byte vectors.
  void save(void *ptr) const {
    signed short *dst = (signed short *)ptr;
    vec_xst(reg.val[0], 0, dst);
    vec_xst(reg.val[1], 16, dst);
  }
};
|
||||||
|
|
||||||
|
// All-zero vector of 16-bit lanes. The BF16 -> FP32 constructors interleave
// it with BF16 data (vec_mergeh / vec_mergel) to build 32-bit float patterns.
const static __vector signed short zero = vec_splats((signed short)0);
|
||||||
|
|
||||||
|
// Thirty-two BF16 values held as raw 16-bit lanes in four VSX registers.
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;

  ss16x8x4_t reg;

  // Loads 64 bytes from `ptr` as four 16-byte vectors using vec_xl, the same
  // load BF16Vec16 uses. The previous code dereferenced `ptr` as an
  // `ss16x8x4_t`, which assumes vector alignment for an arbitrary pointer.
  explicit BF16Vec32(const void *ptr) {
    const signed short *src = reinterpret_cast<const signed short *>(ptr);
    reg.val[0] = vec_xl(0, src);
    reg.val[1] = vec_xl(16, src);
    reg.val[2] = vec_xl(32, src);
    reg.val[3] = vec_xl(48, src);
  }

  explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}

  // Replicates the eight values of `vec8_data` into all four registers.
  explicit BF16Vec32(const BF16Vec8 &vec8_data)
      : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}

  // Stores 64 bytes to `ptr` as four 16-byte vectors using vec_xst.
  void save(void *ptr) const {
    signed short *dst = reinterpret_cast<signed short *>(ptr);
    vec_xst(reg.val[0], 0, dst);
    vec_xst(reg.val[1], 16, dst);
    vec_xst(reg.val[2], 32, dst);
    vec_xst(reg.val[3], 48, dst);
  }
};
|
||||||
|
|
||||||
|
// Four FP32 lanes in one VSX register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  // Overlay giving per-lane scalar access to a register value.
  union AliasReg {
    __vector float reg;
    float values[VEC_ELEM_NUM];
  };

  __vector float reg;

  // Every lane set to `v`.
  explicit FP32Vec4(float v) { reg = vec_splats(v); }

  // Every lane set to zero.
  explicit FP32Vec4() { reg = vec_splats(0.0f); }

  // Loads four floats starting at `ptr`.
  explicit FP32Vec4(const float *ptr) { reg = vec_xl(0, ptr); }

  // Wraps an existing register value.
  explicit FP32Vec4(__vector float data) { reg = data; }

  explicit FP32Vec4(const FP32Vec4 &data) { reg = data.reg; }
};
|
||||||
|
|
||||||
|
// Eight FP32 lanes held in two VSX registers.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  // Overlay giving per-lane scalar access to the register pair.
  union AliasReg {
    f32x4x2_t reg;
    float values[VEC_ELEM_NUM];
  };

  f32x4x2_t reg;

  // Every lane set to `v`.
  explicit FP32Vec8(float v) {
    reg.val[0] = vec_splats(v);
    reg.val[1] = vec_splats(v);
  }

  // Every lane set to zero.
  explicit FP32Vec8() {
    reg.val[0] = vec_splats(0.0f);
    reg.val[1] = vec_splats(0.0f);
  }

  // Loads eight floats starting at `ptr` (two 16-byte loads).
  explicit FP32Vec8(const float *ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
  }

  // Wraps an existing register pair.
  explicit FP32Vec8(f32x4x2_t data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8 &data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
  }

  // BF16 -> FP32: interleaves zero halfwords with the BF16 halfwords so each
  // pair forms a 32-bit float bit pattern.
  explicit FP32Vec8(const BF16Vec8 &v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
  }

  // Returns the sum of all eight lanes, accumulated in lane order.
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });
    return result;
  }

  // Lane-wise std::exp. TODO: Vectorize this
  FP32Vec8 exp() const {
    return map_lanes([](float x) { return std::exp(x); });
  }

  // Lane-wise std::tanh. TODO: Vectorize this
  FP32Vec8 tanh() const {
    return map_lanes([](float x) { return std::tanh(x); });
  }

  // Lane-wise std::erf. TODO: Vectorize this
  FP32Vec8 er() const {
    return map_lanes([](float x) { return std::erf(x); });
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    return FP32Vec8(f32x4x2_t{vec_mul(reg.val[0], b.reg.val[0]),
                              vec_mul(reg.val[1], b.reg.val[1])});
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(f32x4x2_t{vec_add(reg.val[0], b.reg.val[0]),
                              vec_add(reg.val[1], b.reg.val[1])});
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(f32x4x2_t{vec_sub(reg.val[0], b.reg.val[0]),
                              vec_sub(reg.val[1], b.reg.val[1])});
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(f32x4x2_t{vec_div(reg.val[0], b.reg.val[0]),
                              vec_div(reg.val[1], b.reg.val[1])});
  }

  // Stores eight floats to `ptr` (two 16-byte stores).
  void save(float *ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
  }

 private:
  // Applies the scalar function `f` to each lane and returns the results.
  // Replaces three copies of the same eight-assignment sequence, each of
  // which used a four-register temporary where two registers suffice.
  template <typename F> FP32Vec8 map_lanes(F &&f) const {
    AliasReg in;
    in.reg = reg;
    AliasReg out;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&in, &out, &f](int i) { out.values[i] = f(in.values[i]); });
    return FP32Vec8(out.reg);
  }
};
|
||||||
|
|
||||||
|
// Sixteen FP32 lanes held in four VSX registers.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving per-lane scalar access to the four registers.
  union AliasReg {
    f32x4x4_t reg;
    float values[VEC_ELEM_NUM];
  };

  f32x4x4_t reg;

  // Every lane set to `v`.
  explicit FP32Vec16(float v) {
    reg.val[0] = vec_splats(v);
    reg.val[1] = vec_splats(v);
    reg.val[2] = vec_splats(v);
    reg.val[3] = vec_splats(v);
  }

  // Every lane set to zero.
  explicit FP32Vec16() {
    reg.val[0] = vec_splats(0.0f);
    reg.val[1] = vec_splats(0.0f);
    reg.val[2] = vec_splats(0.0f);
    reg.val[3] = vec_splats(0.0f);
  }

  // Loads sixteen floats starting at `ptr` (four 16-byte loads).
  explicit FP32Vec16(const float *ptr) {
    reg.val[0] = vec_xl(0, ptr);
    reg.val[1] = vec_xl(16, ptr);
    reg.val[2] = vec_xl(32, ptr);
    reg.val[3] = vec_xl(48, ptr);
  }

  // Wraps an existing group of four registers.
  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
    reg.val[2] = data.reg.val[2];
    reg.val[3] = data.reg.val[3];
  }

  // Replicates the four lanes of `data` into all four registers.
  explicit FP32Vec16(const FP32Vec4 &data) {
    reg.val[0] = data.reg;
    reg.val[1] = data.reg;
    reg.val[2] = data.reg;
    reg.val[3] = data.reg;
  }

  // Replicates the eight lanes of `data` into both halves.
  explicit FP32Vec16(const FP32Vec8 &data) {
    reg.val[0] = data.reg.val[0];
    reg.val[1] = data.reg.val[1];
    reg.val[2] = data.reg.val[0];
    reg.val[3] = data.reg.val[1];
  }

  // BF16 -> FP32: interleaves zero halfwords with the BF16 halfwords so each
  // pair forms a 32-bit float bit pattern.
  explicit FP32Vec16(const BF16Vec16 &v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
  }

  // Converts eight BF16 values to FP32, then replicates them into both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(f32x4x4_t{vec_mul(reg.val[0], b.reg.val[0]),
                               vec_mul(reg.val[1], b.reg.val[1]),
                               vec_mul(reg.val[2], b.reg.val[2]),
                               vec_mul(reg.val[3], b.reg.val[3])});
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(f32x4x4_t{vec_add(reg.val[0], b.reg.val[0]),
                               vec_add(reg.val[1], b.reg.val[1]),
                               vec_add(reg.val[2], b.reg.val[2]),
                               vec_add(reg.val[3], b.reg.val[3])});
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(f32x4x4_t{vec_sub(reg.val[0], b.reg.val[0]),
                               vec_sub(reg.val[1], b.reg.val[1]),
                               vec_sub(reg.val[2], b.reg.val[2]),
                               vec_sub(reg.val[3], b.reg.val[3])});
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(f32x4x4_t{vec_div(reg.val[0], b.reg.val[0]),
                               vec_div(reg.val[1], b.reg.val[1]),
                               vec_div(reg.val[2], b.reg.val[2]),
                               vec_div(reg.val[3], b.reg.val[3])});
  }

  // Returns the sum of all sixteen lanes, accumulated in lane order.
  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&result, &ar](int i) { result += ar.values[i]; });
    return result;
  }

  // Sums lanes [idx * group_size, (idx + 1) * group_size).
  //
  // group_size: compile-time group width; must divide VEC_ELEM_NUM.
  // idx:        zero-based group index; the caller must keep the range
  //             inside the sixteen lanes.
  //
  // Declared const (it only reads `reg`) to match reduce_sum(); the lambda
  // captures the lane overlay by reference instead of copying 64 bytes.
  template <int group_size> float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);

    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    const int start = idx * group_size;
    unroll_loop<int, group_size>(
        [&result, &start, &ar](int i) { result += ar.values[start + i]; });

    return result;
  }

  // Stores sixteen floats to `ptr` (four 16-byte stores).
  void save(float *ptr) const {
    vec_xst(reg.val[0], 0, ptr);
    vec_xst(reg.val[1], 16, ptr);
    vec_xst(reg.val[2], 32, ptr);
    vec_xst(reg.val[3], 48, ptr);
  }
};
|
||||||
|
|
||||||
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|
||||||
|
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||||
|
|
||||||
|
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||||
|
|
||||||
|
// Generic scalar store: assigns `v` to `*ptr`, relying on T's conversion or
// assignment from float.
template <typename T>
void storeFP32(float v, T *ptr) {
  T &dst = *ptr;
  dst = v;
}
|
||||||
|
|
||||||
|
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
||||||
|
acc = acc + a * b;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar FP32 -> BF16 store by truncation: copies the second 16-bit half of
// the float's storage into *ptr (no rounding).
// NOTE(review): the second half is the high-order half only on a
// little-endian target — confirm this header is never built big-endian.
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *halves =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = halves[1];
}
|
||||||
|
|
||||||
|
#ifndef __VEC_CLASS_FP_NAN
|
||||||
|
#define __VEC_CLASS_FP_NAN (1 << 6)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
|
||||||
|
#ifndef _ARCH_PWR10
|
||||||
|
const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
|
||||||
|
const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
|
||||||
|
const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
|
||||||
|
const static __vector unsigned int one = { 1, 1, 1, 1 };
|
||||||
|
#endif
|
||||||
|
|
||||||
|
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
__vector signed short ret[2];
|
||||||
|
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||||
|
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||||
|
reg = vec_perm(ret[0], ret[1], omask);
|
||||||
|
#elif defined(_ARCH_PWR9)
|
||||||
|
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||||
|
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||||
|
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||||
|
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||||
|
lsb0 = vec_and(lsb0, one);
|
||||||
|
lsb1 = vec_and(lsb1, one);
|
||||||
|
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||||
|
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||||
|
inp0 = vec_add(inp0, rnd0);
|
||||||
|
inp1 = vec_add(inp1, rnd1);
|
||||||
|
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||||
|
inp0 = vec_sel(inp0, nan, sel0);
|
||||||
|
inp1 = vec_sel(inp1, nan, sel1);
|
||||||
|
inp0 = vec_sr(inp0, sh16);
|
||||||
|
inp1 = vec_sr(inp1, sh16);
|
||||||
|
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
__vector signed short ret[4];
|
||||||
|
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||||
|
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||||
|
ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
|
||||||
|
ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
|
||||||
|
reg.val[0] = vec_perm(ret[0], ret[1], omask);
|
||||||
|
reg.val[1] = vec_perm(ret[2], ret[3], omask);
|
||||||
|
#elif defined(_ARCH_PWR9)
|
||||||
|
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||||
|
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||||
|
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
|
||||||
|
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
|
||||||
|
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||||
|
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||||
|
__vector unsigned int lsb2 = vec_sr(inp2, sh16);
|
||||||
|
__vector unsigned int lsb3 = vec_sr(inp3, sh16);
|
||||||
|
lsb0 = vec_and(lsb0, one);
|
||||||
|
lsb1 = vec_and(lsb1, one);
|
||||||
|
lsb2 = vec_and(lsb2, one);
|
||||||
|
lsb3 = vec_and(lsb3, one);
|
||||||
|
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||||
|
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||||
|
__vector unsigned int rnd2 = vec_add(lsb2, bias);
|
||||||
|
__vector unsigned int rnd3 = vec_add(lsb3, bias);
|
||||||
|
inp0 = vec_add(inp0, rnd0);
|
||||||
|
inp1 = vec_add(inp1, rnd1);
|
||||||
|
inp2 = vec_add(inp2, rnd2);
|
||||||
|
inp3 = vec_add(inp3, rnd3);
|
||||||
|
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
|
||||||
|
__vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
|
||||||
|
inp0 = vec_sel(inp0, nan, sel0);
|
||||||
|
inp1 = vec_sel(inp1, nan, sel1);
|
||||||
|
inp2 = vec_sel(inp2, nan, sel2);
|
||||||
|
inp3 = vec_sel(inp3, nan, sel3);
|
||||||
|
inp0 = vec_sr(inp0, sh16);
|
||||||
|
inp1 = vec_sr(inp1, sh16);
|
||||||
|
inp2 = vec_sr(inp2, sh16);
|
||||||
|
inp3 = vec_sr(inp3, sh16);
|
||||||
|
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||||
|
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Issues a `dcbt` instruction on `addr` via inline assembly.
inline void prefetch(const void *addr) {
  __asm__ __volatile__("dcbt 0, %0"
                       :
                       : "r"(addr)
                       : "memory");
}
|
||||||
|
|
||||||
|
}; // namespace vec_op
|
||||||
|
|
||||||
|
#endif
|
||||||
515
csrc/cpu/cpu_types_x86.hpp
Normal file
515
csrc/cpu/cpu_types_x86.hpp
Normal file
@@ -0,0 +1,515 @@
|
|||||||
|
|
||||||
|
#ifndef CPU_TYPES_X86_HPP
|
||||||
|
#define CPU_TYPES_X86_HPP
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#ifndef __AVX2__
|
||||||
|
static_assert(false, "AVX2 must be supported for the current implementation.");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
// Dispatch cases for the floating-point element types handled here:
// Float and BFloat16.
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)

// Switches on TYPE and runs the body for each case listed above.
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

// Kernel entry/exit tracing. With CPU_OP_GUARD defined, each guard prints the
// kernel name to std::cout; otherwise both expand to nothing.
// The tracing variants already end in ';', so call sites add none.
#ifdef CPU_OP_GUARD
  #define CPU_KERNEL_GUARD_IN(NAME) \
    std::cout << #NAME << " invoked." << std::endl;
  #define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
#else
  #define CPU_KERNEL_GUARD_IN(NAME)
  #define CPU_KERNEL_GUARD_OUT(NAME)
#endif

// Requests inlining via the always_inline attribute.
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
// Calls f(std::integral_constant<T, i>{}) once per index of the sequence, in
// ascending order, via a comma fold.
//
// This helper used to live in an anonymous namespace. In a header that gives
// it internal linkage while unroll_loop (external linkage) calls it, so every
// translation unit would see a different unroll_loop body. As a plain
// function template it has one definition across translation units.
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
  (f(std::integral_constant<T, indexes>{}), ...);
}

// Compile-time unrolled loop: invokes f(0), f(1), ..., f(count - 1).
//
// T:     index type.
// count: number of iterations; 0 invokes nothing.
// f:     callable that accepts a T.
template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F &&f) {
  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
|
||||||
|
|
||||||
|
// CRTP base for the vector wrappers: exposes the derived type's lane count.
template <typename T>
struct Vec {
  // Returns T::VEC_ELEM_NUM.
  static constexpr int get_elem_num() {
    return T::VEC_ELEM_NUM;
  }
};
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
// Eight FP16 lanes in one 128-bit register (AVX512-FP16 builds only).
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128h reg;

  // Every lane set to `v`.
  explicit FP16Vec8(_Float16 v) { reg = _mm_set1_ph(v); }

  // Unaligned load of eight halves from `ptr`.
  explicit FP16Vec8(const void *ptr) { reg = _mm_loadu_ph(ptr); }

  // Wraps an existing register value.
  explicit FP16Vec8(__m128h data) { reg = data; }

  FP16Vec8 operator*(const FP16Vec8 &b) const {
    const __m128h product = _mm_mul_ph(reg, b.reg);
    return FP16Vec8(product);
  }

  FP16Vec8 operator+(const FP16Vec8 &b) const {
    const __m128h sum = _mm_add_ph(reg, b.reg);
    return FP16Vec8(sum);
  }

  FP16Vec8 operator-(const FP16Vec8 &b) const {
    const __m128h difference = _mm_sub_ph(reg, b.reg);
    return FP16Vec8(difference);
  }

  FP16Vec8 operator/(const FP16Vec8 &b) const {
    const __m128h quotient = _mm_div_ph(reg, b.reg);
    return FP16Vec8(quotient);
  }

  // Unaligned store of eight halves to `ptr`.
  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
};
#endif
|
||||||
|
|
||||||
|
// Eight BF16 values held as raw 16-bit lanes in one 128-bit register.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  __m128i reg;

  // Unaligned load of 16 bytes from `ptr`.
  explicit BF16Vec8(const void *ptr)
      : reg(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  // FP32 -> BF16 conversion; defined after FP32Vec8 is complete.
  explicit BF16Vec8(const FP32Vec8 &);

  // Unaligned store of 16 bytes to `ptr`, matching the unaligned load.
  // The previous `*reinterpret_cast<__m128i *>(ptr) = reg` is an
  // alignment-assuming store and is not valid for an arbitrary `ptr`.
  void save(void *ptr) const {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), reg);
  }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 values held as raw 16-bit lanes in one 256-bit register.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  __m256i reg;

  // Unaligned load of 32 bytes from `ptr`.
  explicit BF16Vec16(const void *ptr)
      : reg(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  // FP32 -> BF16 conversion; defined after FP32Vec16 is complete.
  explicit BF16Vec16(const FP32Vec16 &);

  // Unaligned store of 32 bytes to `ptr`, matching the unaligned load.
  // The previous `*reinterpret_cast<__m256i *>(ptr) = reg` is an
  // alignment-assuming store and is not valid for an arbitrary `ptr`.
  void save(void *ptr) const {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), reg);
  }
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 32;
|
||||||
|
|
||||||
|
__m512i reg;
|
||||||
|
|
||||||
|
explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(__m512i data) : reg(data) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||||
|
: reg((__m512i)_mm512_inserti32x4(
|
||||||
|
_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
|
||||||
|
(__m128i)vec8_data.reg),
|
||||||
|
(__m128i)vec8_data.reg, 1),
|
||||||
|
(__m128i)vec8_data.reg, 2),
|
||||||
|
(__m128i)vec8_data.reg, 3)) {}
|
||||||
|
|
||||||
|
void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 32;
|
||||||
|
|
||||||
|
__m256i reg_low;
|
||||||
|
__m256i reg_high;
|
||||||
|
|
||||||
|
explicit BF16Vec32(const void *ptr)
|
||||||
|
: reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
|
||||||
|
reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
|
||||||
|
reg_high(high) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||||
|
: reg_low((__m256i)_mm256_inserti32x4(
|
||||||
|
_mm256_castsi128_si256((__m128i)vec8_data.reg),
|
||||||
|
(__m128i)vec8_data.reg, 1)),
|
||||||
|
reg_high((__m256i)_mm256_inserti32x4(
|
||||||
|
_mm256_castsi128_si256((__m128i)vec8_data.reg),
|
||||||
|
(__m128i)vec8_data.reg, 1)) {}
|
||||||
|
|
||||||
|
void save(void *ptr) const {
|
||||||
|
*reinterpret_cast<__m256i *>(ptr) = reg_low;
|
||||||
|
*reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Four FP32 lanes in one 128-bit register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;

  // Overlay giving per-lane scalar access to a register value.
  union AliasReg {
    __m128 reg;
    float values[VEC_ELEM_NUM];
  };

  __m128 reg;

  // Every lane set to `v`.
  explicit FP32Vec4(float v) { reg = _mm_set1_ps(v); }

  // Every lane set to zero.
  explicit FP32Vec4() { reg = _mm_setzero_ps(); }

  // Unaligned load of four floats from `ptr`.
  explicit FP32Vec4(const float *ptr) { reg = _mm_loadu_ps(ptr); }

  // Wraps an existing register value.
  explicit FP32Vec4(__m128 data) { reg = data; }

  explicit FP32Vec4(const FP32Vec4 &data) { reg = data.reg; }
};
|
||||||
|
|
||||||
|
// Eight FP32 lanes in one 256-bit register.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;

  // Overlay giving per-lane scalar access to a register value.
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };

  __m256 reg;

  // Every lane set to `v`.
  explicit FP32Vec8(float v) { reg = _mm256_set1_ps(v); }

  // Every lane set to zero.
  explicit FP32Vec8() { reg = _mm256_setzero_ps(); }

  // Unaligned load of eight floats from `ptr`.
  explicit FP32Vec8(const float *ptr) { reg = _mm256_loadu_ps(ptr); }

  // Wraps an existing register value.
  explicit FP32Vec8(__m256 data) { reg = data; }

  explicit FP32Vec8(const FP32Vec8 &data) { reg = data.reg; }

#ifdef __AVX512FP16__
  // Widens eight FP16 lanes to FP32.
  explicit FP32Vec8(__m128h v) { reg = _mm256_cvtph_ps(_mm_castph_si128(v)); }
#endif

  // BF16 -> FP32: zero-extend each 16-bit value to 32 bits, then byte-shift
  // left by two so the payload sits in the upper half of each 32-bit slot.
  explicit FP32Vec8(const BF16Vec8 &v) {
    const __m256i widened = _mm256_cvtepu16_epi32(v.reg);
    reg = _mm256_castsi256_ps(_mm256_bslli_epi128(widened, 2));
  }

  // Returns the sum of all eight lanes, accumulated in lane order.
  float reduce_sum() const {
    AliasReg lanes;
    lanes.reg = reg;
    float total = 0;
    unroll_loop<int, VEC_ELEM_NUM>(
        [&total, &lanes](int i) { total += lanes.values[i]; });

    return total;
  }

  // Lane-wise expf, computed one scalar at a time.
  FP32Vec8 exp() const {
    AliasReg lanes;
    lanes.reg = reg;
    const float *x = lanes.values;
    // _mm256_set_ps takes its arguments from the highest lane down.
    return FP32Vec8(_mm256_set_ps(expf(x[7]), expf(x[6]), expf(x[5]),
                                  expf(x[4]), expf(x[3]), expf(x[2]),
                                  expf(x[1]), expf(x[0])));
  }

  // Lane-wise tanhf, computed one scalar at a time.
  FP32Vec8 tanh() const {
    AliasReg lanes;
    lanes.reg = reg;
    const float *x = lanes.values;
    return FP32Vec8(_mm256_set_ps(tanhf(x[7]), tanhf(x[6]), tanhf(x[5]),
                                  tanhf(x[4]), tanhf(x[3]), tanhf(x[2]),
                                  tanhf(x[1]), tanhf(x[0])));
  }

  // Lane-wise erf, computed one scalar at a time.
  FP32Vec8 er() const {
    AliasReg lanes;
    lanes.reg = reg;
    return FP32Vec8(_mm256_set_ps(
        erf(lanes.values[7]), erf(lanes.values[6]), erf(lanes.values[5]),
        erf(lanes.values[4]), erf(lanes.values[3]), erf(lanes.values[2]),
        erf(lanes.values[1]), erf(lanes.values[0])));
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    const __m256 product = _mm256_mul_ps(reg, b.reg);
    return FP32Vec8(product);
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    const __m256 sum = _mm256_add_ps(reg, b.reg);
    return FP32Vec8(sum);
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    const __m256 difference = _mm256_sub_ps(reg, b.reg);
    return FP32Vec8(difference);
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    const __m256 quotient = _mm256_div_ps(reg, b.reg);
    return FP32Vec8(quotient);
  }

  // Unaligned store of eight floats to `ptr`.
  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Sixteen packed single-precision floats held in one 512-bit register.
// This variant is the one compiled when __AVX512F__ is defined.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Gives element-wise access to the register contents.
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  // Broadcasts v to every lane.
  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  // All lanes set to zero.
  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  // Unaligned load of 16 floats starting at ptr.
  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  // Wraps an existing register value.
  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  // Repeats the 128-bit payload of data in all four 128-bit slots.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}

  // Repeats the 256-bit payload of data in both 256-bit halves.
  // _mm512_inserti64x4 moves the same 256 bits as _mm512_inserti32x8 but only
  // needs AVX512F, which is the feature this struct is guarded by.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti64x4(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}

  // Zero-extends each 16-bit element of v to 32 bits and moves it into the
  // upper half of its 32-bit lane. Because the upper 16 bits are zero after
  // the extension, a per-lane shift by 16 bits gives the same bits as the
  // 2-byte per-128-bit-lane byte shift, without requiring AVX512BW.
  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_slli_epi32(_mm512_cvtepu16_epi32(v.reg), 16))) {}

  // Converts through FP32Vec8, then duplicates into both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  // Sum of all 16 lanes.
  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  // Sum of the group_size consecutive lanes that form group number idx.
  // The lane mask is group_size low bits shifted left by idx * group_size.
  // Declared const because it only reads the register.
  template <int group_size> float reduce_sub_sum(int idx) const {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  // Unaligned store of all 16 lanes to ptr.
  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};
|
||||||
|
#else
|
||||||
|
// Sixteen packed single-precision floats stored as two 256-bit registers.
// This variant is the one compiled when __AVX512F__ is not defined.
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Gives element-wise access to one 256-bit half.
  union AliasReg {
    __m256 reg;
    float values[8];
  };

  __m256 reg_low;   // elements 0..7
  __m256 reg_high;  // elements 8..15

  // Broadcasts v to every lane of both halves.
  explicit FP32Vec16(float v)
      : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {}

  // All lanes set to zero.
  explicit FP32Vec16()
      : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {}

  // Unaligned load of 16 floats: ptr[0..7] and ptr[8..15].
  explicit FP32Vec16(const float *ptr)
      : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {}

  // Wraps two existing register values.
  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}

  explicit FP32Vec16(const FP32Vec16 &data)
      : reg_low(data.reg_low), reg_high(data.reg_high) {}

  // Repeats the 128-bit payload of data in all four 128-bit slots.
  explicit FP32Vec16(const FP32Vec4 &data)
      : reg_low((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)),
        reg_high((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {}

  // Repeats the 256-bit payload of data in both halves.
  explicit FP32Vec16(const FP32Vec8 &data)
      : reg_low(data.reg), reg_high(data.reg) {}

  // Splits v into two 128-bit halves, zero-extends each 16-bit element to
  // 32 bits and shifts it by two bytes so it lands in the upper half of its
  // 32-bit lane.
  explicit FP32Vec16(const BF16Vec16 &v) {
    __m128i low = _mm256_extractf128_si256(v.reg, 0);
    __m128i high = _mm256_extractf128_si256(v.reg, 1);

    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);

    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);

    reg_low = _mm256_castsi256_ps(v_low_shifted);
    reg_high = _mm256_castsi256_ps(v_high_shifted);
  }

  // Converts through FP32Vec8, then duplicates into both halves.
  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
                     _mm256_mul_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
                     _mm256_add_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
                     _mm256_sub_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
                     _mm256_div_ps(reg_high, b.reg_high));
  }

  // Sum of all 16 lanes, computed as the sum of the two half reductions.
  float reduce_sum() const {
    FP32Vec8 low = FP32Vec8(reg_low);
    FP32Vec8 high = FP32Vec8(reg_high);
    return low.reduce_sum() + high.reduce_sum();
  }

  // Sum of the group_size consecutive lanes that form group number idx.
  // A 16-bit lane mask is built and consumed one bit per lane, first over the
  // low half and then over the high half. Declared const because it only
  // reads the registers.
  template <int group_size> float reduce_sub_sum(int idx) const {
    static_assert(group_size > 0, "group_size must be positive");
    static_assert(VEC_ELEM_NUM % group_size == 0);

    float sum = 0.0;
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    uint32_t mask = base_mask << (idx * group_size);

    AliasReg ar;

    // Consumes the lowest mask bit on every call; adds lane i when it is set.
    auto func = [&sum, &mask, &ar](int i) {
      int flag = mask & 0x1;
      mask = mask >> 1;
      if (flag != 0) sum += ar.values[i];
    };

    ar.reg = reg_low;
    unroll_loop<int, 8>(func);

    ar.reg = reg_high;
    unroll_loop<int, 8>(func);

    return sum;
  }

  // Unaligned store of all 16 lanes: ptr[0..7] and ptr[8..15].
  void save(float *ptr) const {
    _mm256_storeu_ps(ptr, reg_low);
    _mm256_storeu_ps(ptr + 8, reg_high);
  }
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T> struct VecType { using vec_type = void; };
|
||||||
|
|
||||||
|
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|
||||||
|
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||||
|
|
||||||
|
// Generic store: assigns the float v to *ptr, relying on the conversion that
// T provides from float.
template <typename T>
void storeFP32(float v, T *ptr) {
  *ptr = v;
}
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
// c10::Half specialization: writes through a _Float16 view of the
// destination so the float is converted by the compiler's native type.
template <>
inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  _Float16 *dst = reinterpret_cast<_Float16 *>(ptr);
  *dst = v;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Multiply-accumulate built from the overloaded operators: acc += a * b.
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  const FP32Vec16 product(a * b);
  acc = acc + product;
}
|
||||||
|
|
||||||
|
#ifdef __AVX512BF16__
|
||||||
|
// c10::BFloat16 specialization for builds with __AVX512BF16__: converts with
// _mm_cvtness_sbh and writes through a __bfloat16 view of the destination.
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  __bfloat16 *dst = reinterpret_cast<__bfloat16 *>(ptr);
  *dst = _mm_cvtness_sbh(v);
}
|
||||||
|
|
||||||
|
// Builds a BF16Vec8 from eight floats using the _mm256_cvtneps_pbh
// conversion; the result is stored as an integer register.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
|
||||||
|
|
||||||
|
// Builds a BF16Vec16 from sixteen floats using the _mm512_cvtneps_pbh
// conversion; the result is stored as an integer register.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
|
||||||
|
|
||||||
|
// Accumulates the bf16 dot product of a and b into acc with
// _mm512_dpbf16_ps; the integer registers are reinterpreted as __m512bh.
inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
  const __m512bh lhs = (__m512bh)a.reg;
  const __m512bh rhs = (__m512bh)b.reg;
  acc.reg = _mm512_dpbf16_ps(acc.reg, lhs, rhs);
}
|
||||||
|
#else
|
||||||
|
// c10::BFloat16 specialization for builds without __AVX512BF16__.
// The float's storage is viewed as 16-bit BFloat16 slots and the slot at
// index 1 is copied out; the may_alias attribute marks the type-punned view.
// NOTE(review): index 1 is presumably the high-order half on little-endian
// targets, which would make this a truncating conversion — confirm.
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *halves =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = halves[1];
}
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
// Builds a BF16Vec8 from eight floats: the float bits are byte-shifted right
// by two bytes within each 128-bit lane, then every 32-bit element is
// narrowed to 16 bits.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(_mm256_cvtepi32_epi16(
          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
|
||||||
|
|
||||||
|
// Builds a BF16Vec16 from sixteen floats: the float bits are byte-shifted
// right by two bytes within each 128-bit lane, then every 32-bit element is
// narrowed to 16 bits.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg(_mm512_cvtepi32_epi16(
          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
|
||||||
|
#else
|
||||||
|
// Narrows eight floats to eight 16-bit values holding the upper half of each
// float's bit pattern, using AVX2 instructions only.
// Declared inline in the enclosing namespace instead of inside an unnamed
// namespace, so translation units including this file share one definition
// rather than each getting its own internal-linkage copy.
inline __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
  __m256i ai = _mm256_castps_si256(a);
  // Move the upper 16 bits of every 32-bit element into its lower 16 bits.
  ai = _mm256_srli_epi32(ai, 16);
  // Pack to 16-bit elements; the pack works per 128-bit lane, so each lane
  // ends up holding its four packed values twice.
  ai = _mm256_packus_epi32(ai, ai);
  // Reorder the 64-bit chunks so the two distinct packed groups sit next to
  // each other in the low 128 bits.
  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
  return _mm256_extracti128_si256(ai, 0);
}
|
||||||
|
|
||||||
|
// Builds a BF16Vec8 from eight floats by delegating to the AVX2 helper.
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
|
||||||
|
|
||||||
|
// Builds a BF16Vec16 from sixteen floats: each 256-bit half of v is converted
// separately through BF16Vec8, and the two 128-bit results are joined with
// the low half first.
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
  BF16Vec8 lower_half = BF16Vec8(FP32Vec8(v.reg_low));
  BF16Vec8 upper_half = BF16Vec8(FP32Vec8(v.reg_high));

  const __m256i widened_lower = _mm256_castsi128_si256(lower_half.reg);
  reg = _mm256_insertf128_si256(widened_lower, upper_half.reg, 1);
}
|
||||||
|
#endif // __AVX512F__
|
||||||
|
#endif // __AVX512BF16__
|
||||||
|
|
||||||
|
// Issues a prefetch for addr with the _MM_HINT_T1 locality hint.
inline void prefetch(const void *addr) {
  _mm_prefetch(addr, _MM_HINT_T1);
}
|
||||||
|
|
||||||
|
}; // namespace vec_op
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
||||||
float epsilon) {
|
double epsilon) {
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
||||||
torch::Tensor& weight, float epsilon) {
|
torch::Tensor& weight, double epsilon) {
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,57 @@ void rotary_embedding_impl(
|
|||||||
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
|
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
|
||||||
|
|
||||||
const int embed_dim = rot_dim / 2;
|
const int embed_dim = rot_dim / 2;
|
||||||
TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
|
bool flag = (embed_dim % VEC_ELEM_NUM == 0);
|
||||||
|
const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
|
||||||
|
|
||||||
|
auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
|
||||||
|
scalar_t* qk) {
|
||||||
|
int j = 0;
|
||||||
|
for (; j < loop_upper; j += VEC_ELEM_NUM) {
|
||||||
|
const int rot_offset = j;
|
||||||
|
const int x_index = rot_offset;
|
||||||
|
const int y_index = embed_dim + rot_offset;
|
||||||
|
|
||||||
|
const int64_t out_x = token_head + x_index;
|
||||||
|
const int64_t out_y = token_head + y_index;
|
||||||
|
|
||||||
|
const scalar_vec_t cos(cache_ptr + x_index);
|
||||||
|
const scalar_vec_t sin(cache_ptr + y_index);
|
||||||
|
|
||||||
|
const scalar_vec_t q_x(qk + out_x);
|
||||||
|
const scalar_vec_t q_y(qk + out_y);
|
||||||
|
|
||||||
|
vec_op::FP32Vec8 fp32_cos(cos);
|
||||||
|
vec_op::FP32Vec8 fp32_sin(sin);
|
||||||
|
|
||||||
|
vec_op::FP32Vec8 fp32_q_x(q_x);
|
||||||
|
vec_op::FP32Vec8 fp32_q_y(q_y);
|
||||||
|
|
||||||
|
auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
|
||||||
|
scalar_vec_t(out1).save(qk + out_x);
|
||||||
|
|
||||||
|
auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
|
||||||
|
scalar_vec_t(out2).save(qk + out_y);
|
||||||
|
}
|
||||||
|
if (!flag) {
|
||||||
|
for (; j < embed_dim; ++j) {
|
||||||
|
const int x_index = j;
|
||||||
|
const int y_index = embed_dim + j;
|
||||||
|
|
||||||
|
const int64_t out_x = token_head + x_index;
|
||||||
|
const int64_t out_y = token_head + y_index;
|
||||||
|
|
||||||
|
const float fp32_cos = cache_ptr[x_index];
|
||||||
|
const float fp32_sin = cache_ptr[y_index];
|
||||||
|
|
||||||
|
const float fp32_q_x = qk[out_x];
|
||||||
|
const float fp32_q_y = qk[out_y];
|
||||||
|
|
||||||
|
qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
|
||||||
|
qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
|
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
|
||||||
@@ -32,62 +82,13 @@ void rotary_embedding_impl(
|
|||||||
const int head_idx = i;
|
const int head_idx = i;
|
||||||
const int64_t token_head =
|
const int64_t token_head =
|
||||||
token_idx * query_stride + head_idx * head_size;
|
token_idx * query_stride + head_idx * head_size;
|
||||||
for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
|
compute_loop(token_head, cache_ptr, query);
|
||||||
const int rot_offset = j;
|
|
||||||
const int x_index = rot_offset;
|
|
||||||
const int y_index = embed_dim + rot_offset;
|
|
||||||
|
|
||||||
const int64_t out_x = token_head + x_index;
|
|
||||||
const int64_t out_y = token_head + y_index;
|
|
||||||
|
|
||||||
const scalar_vec_t cos(cache_ptr + x_index);
|
|
||||||
const scalar_vec_t sin(cache_ptr + y_index);
|
|
||||||
|
|
||||||
const scalar_vec_t q_x(query + out_x);
|
|
||||||
const scalar_vec_t q_y(query + out_y);
|
|
||||||
|
|
||||||
vec_op::FP32Vec8 fp32_cos(cos);
|
|
||||||
vec_op::FP32Vec8 fp32_sin(sin);
|
|
||||||
|
|
||||||
vec_op::FP32Vec8 fp32_q_x(q_x);
|
|
||||||
vec_op::FP32Vec8 fp32_q_y(q_y);
|
|
||||||
|
|
||||||
auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
|
|
||||||
scalar_vec_t(out1).save(query + out_x);
|
|
||||||
|
|
||||||
auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
|
|
||||||
scalar_vec_t(out2).save(query + out_y);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < num_kv_heads; ++i) {
|
for (int i = 0; i < num_kv_heads; ++i) {
|
||||||
const int head_idx = i;
|
const int head_idx = i;
|
||||||
const int64_t token_head = token_idx * key_stride + head_idx * head_size;
|
const int64_t token_head = token_idx * key_stride + head_idx * head_size;
|
||||||
for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
|
compute_loop(token_head, cache_ptr, key);
|
||||||
const int rot_offset = j;
|
|
||||||
const int x_index = rot_offset;
|
|
||||||
const int y_index = embed_dim + rot_offset;
|
|
||||||
|
|
||||||
const int64_t out_x = token_head + x_index;
|
|
||||||
const int64_t out_y = token_head + y_index;
|
|
||||||
|
|
||||||
const scalar_vec_t cos(cache_ptr + x_index);
|
|
||||||
const scalar_vec_t sin(cache_ptr + y_index);
|
|
||||||
|
|
||||||
const scalar_vec_t k_x(key + out_x);
|
|
||||||
const scalar_vec_t k_y(key + out_y);
|
|
||||||
|
|
||||||
vec_op::FP32Vec8 fp32_cos(cos);
|
|
||||||
vec_op::FP32Vec8 fp32_sin(sin);
|
|
||||||
|
|
||||||
vec_op::FP32Vec8 fp32_k_x(k_x);
|
|
||||||
vec_op::FP32Vec8 fp32_k_y(k_y);
|
|
||||||
|
|
||||||
auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
|
|
||||||
scalar_vec_t(out1).save(key + out_x);
|
|
||||||
auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
|
|
||||||
scalar_vec_t(out2).save(key + out_y);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -167,7 +168,7 @@ void rotary_embedding_gptj_impl(
|
|||||||
}; // namespace
|
}; // namespace
|
||||||
|
|
||||||
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
||||||
torch::Tensor& key, int head_size,
|
torch::Tensor& key, int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, bool is_neox) {
|
torch::Tensor& cos_sin_cache, bool is_neox) {
|
||||||
int num_tokens = query.numel() / query.size(-1);
|
int num_tokens = query.numel() / query.size(-1);
|
||||||
int rot_dim = cos_sin_cache.size(1);
|
int rot_dim = cos_sin_cache.size(1);
|
||||||
|
|||||||
@@ -1,44 +0,0 @@
|
|||||||
#include "cache.h"
|
|
||||||
#include "cuda_utils.h"
|
|
||||||
#include "ops.h"
|
|
||||||
#include <torch/extension.h>
|
|
||||||
|
|
||||||
// Python bindings: registers the operators in an "ops" submodule and the
// cache operators in a "cache_ops" submodule of the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
  ops.def("paged_attention_v1", &paged_attention_v1,
          "Compute the attention between an input query and the cached "
          "keys/values using PagedAttention.");
  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");

  // Activation ops
  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
  ops.def("gelu_and_mul", &gelu_and_mul,
          "Activation function used in GeGLU with `none` approximation.");
  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
          "Activation function used in GeGLU with `tanh` approximation.");
  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");

  // Layernorm
  ops.def("rms_norm", &rms_norm,
          "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
          "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def("rotary_embedding", &rotary_embedding,
          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def("swap_blocks", &swap_blocks,
                "Swap in (out) the cache blocks from src to dst");
  // The function pointer is &copy_blocks; the "©" in the scraped text was an
  // HTML-entity corruption of "&copy".
  cache_ops.def("copy_blocks", &copy_blocks,
                "Copy the cache blocks from src to dst");
  cache_ops.def("reshape_and_cache", &reshape_and_cache,
                "Reshape the key and value tensors and cache them");
}
|
|
||||||
110
csrc/cpu/torch_bindings.cpp
Normal file
110
csrc/cpu/torch_bindings.cpp
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
#include "cache.h"
|
||||||
|
#include "ops.h"
|
||||||
|
#include "registration.h"
|
||||||
|
|
||||||
|
#include <torch/library.h>
|
||||||
|
|
||||||
|
// Operator registrations for the CPU backend. Every operator is declared
// with a schema string via ops.def and then bound to its implementation for
// torch::kCPU via ops.impl.
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // ---- Attention -----------------------------------------------------------

  // PagedAttention V1: attention between an input query and the cached
  // keys/values.
  ops.def(
      "paged_attention_v1("
      " Tensor! out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, float kv_scale, int tp_rank,"
      " int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
      " Tensor tmp_out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, float kv_scale, int tp_rank,"
      " int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);

  // ---- Activations ---------------------------------------------------------

  // SwiGLU activation.
  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);

  // GeGLU activation, `none` approximation.
  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);

  // GeGLU activation, `tanh` approximation.
  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);

  // GELU as used in GPT-2.
  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_new", torch::kCPU, &gelu_new);

  // Approximate GELU.
  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);

  // Quick GELU.
  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_quick", torch::kCPU, &gelu_quick);

  // ---- Layer normalization -------------------------------------------------

  // Root Mean Square (RMS) normalization of the input tensor.
  ops.def(
      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
      "()");
  ops.impl("rms_norm", torch::kCPU, &rms_norm);

  // Fused residual add and RMS normalization, performed in place.
  ops.def(
      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);

  // ---- Rotary embedding ----------------------------------------------------

  // GPT-NeoX or GPT-J style rotary embedding applied to query and key.
  ops.def(
      "rotary_embedding(Tensor positions, Tensor! query,"
      " Tensor! key, int head_size,"
      " Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
}
|
||||||
|
|
||||||
|
// Cache operator registrations for the CPU backend, placed in a library
// whose name is TORCH_EXTENSION_NAME with the suffix _cache_ops.
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);

  // Copy the cache blocks from src to dst.
  cache_ops.def(
      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
      "block_mapping) -> ()");
  // The implementation pointer is &copy_blocks; the "©" in the scraped text
  // was an HTML-entity corruption of "&copy".
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
      " Tensor! key_cache, Tensor! value_cache,"
      " Tensor slot_mapping,"
      " str kv_cache_dtype,"
      " float kv_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
}
|
||||||
|
|
||||||
|
// Invokes the REGISTER_EXTENSION macro for this extension's name.
// NOTE(review): the macro is presumably provided by "registration.h", which
// this file includes — confirm what it expands to.
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||||
@@ -19,8 +19,12 @@
|
|||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
|
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
|
||||||
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
|
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
|
||||||
|
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
|
||||||
|
__shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
|
||||||
#else
|
#else
|
||||||
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
|
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
|
||||||
|
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
|
||||||
|
__shfl_xor(var, lane_mask, width)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
int64_t get_device_attribute(int64_t attribute, int64_t device_id);
|
||||||
|
|
||||||
int get_device_attribute(int attribute, int device_id);
|
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
|
||||||
|
|
||||||
int get_max_shared_memory_per_block_device_attribute(int device_id);
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
#include <hip/hip_runtime.h>
|
#include <hip/hip_runtime.h>
|
||||||
#include <hip/hip_runtime_api.h>
|
#include <hip/hip_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
int get_device_attribute(int attribute, int device_id) {
|
int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
|
||||||
int device, value;
|
int device, value;
|
||||||
if (device_id < 0) {
|
if (device_id < 0) {
|
||||||
cudaGetDevice(&device);
|
cudaGetDevice(&device);
|
||||||
@@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) {
|
|||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_max_shared_memory_per_block_device_attribute(int device_id) {
|
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
|
||||||
int attribute;
|
int64_t attribute;
|
||||||
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
|
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
|
||||||
// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
|
// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
#include <ATen/cuda/Exceptions.h>
|
#include <ATen/cuda/Exceptions.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <c10/cuda/CUDAStream.h>
|
#include <c10/cuda/CUDAStream.h>
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
#include "custom_all_reduce.cuh"
|
#include "custom_all_reduce.cuh"
|
||||||
|
|
||||||
// fake pointer type
|
// fake pointer type, must match fptr_t type in ops.h
|
||||||
using fptr_t = uint64_t;
|
using fptr_t = int64_t;
|
||||||
static_assert(sizeof(void*) == sizeof(fptr_t));
|
static_assert(sizeof(void*) == sizeof(fptr_t));
|
||||||
|
|
||||||
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
||||||
const std::vector<std::string>& handles,
|
const std::vector<std::string>& handles,
|
||||||
const std::vector<int64_t>& offsets, int rank,
|
const std::vector<int64_t>& offsets, int64_t rank,
|
||||||
bool full_nvlink) {
|
bool full_nvlink) {
|
||||||
int world_size = offsets.size();
|
int world_size = offsets.size();
|
||||||
if (world_size > 8)
|
if (world_size > 8)
|
||||||
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
|
|||||||
t.numel() * t.element_size());
|
t.numel() * t.element_size());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
|
bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
|
||||||
bool full_nvlink) {
|
bool full_nvlink) {
|
||||||
auto inp_size = inp.numel() * inp.element_size();
|
auto inp_size = inp.numel() * inp.element_size();
|
||||||
// custom allreduce requires input byte size to be multiples of 16
|
// custom allreduce requires input byte size to be multiples of 16
|
||||||
@@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
|
|||||||
delete fa;
|
delete fa;
|
||||||
}
|
}
|
||||||
|
|
||||||
int meta_size() { return sizeof(vllm::Signal); }
|
int64_t meta_size() { return sizeof(vllm::Signal); }
|
||||||
|
|
||||||
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
||||||
const std::vector<std::string>& handles,
|
const std::vector<std::string>& handles,
|
||||||
@@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
|
|||||||
fa->register_buffer(handles, offsets, t.data_ptr());
|
fa->register_buffer(handles, offsets, t.data_ptr());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
||||||
fptr_t _fa) {
|
fptr_t _fa) {
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
return fa->get_graph_buffer_ipc_meta();
|
auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
|
||||||
|
auto options =
|
||||||
|
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
|
||||||
|
auto handles =
|
||||||
|
torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
|
||||||
|
std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
|
||||||
|
return {handles, std::move(offsets)};
|
||||||
}
|
}
|
||||||
|
|
||||||
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
@@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
|
|||||||
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||||
torch::Tensor& input, // [..., hidden_size]
|
torch::Tensor& input, // [..., hidden_size]
|
||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
float epsilon) {
|
double epsilon) {
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
@@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
|||||||
void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||||
torch::Tensor& residual, // [..., hidden_size]
|
torch::Tensor& residual, // [..., hidden_size]
|
||||||
torch::Tensor& weight, // [hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
float epsilon) {
|
double epsilon) {
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
#include "moe_ops.h"
|
|
||||||
|
|
||||||
#include <torch/extension.h>
|
|
||||||
|
|
||||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|
||||||
m.def("topk_softmax", &topk_softmax,
|
|
||||||
"Apply topk softmax to the gating outputs.");
|
|
||||||
}
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||||
torch::Tensor& token_expert_indices,
|
torch::Tensor& token_expert_indices,
|
||||||
|
|||||||
@@ -16,18 +16,25 @@
|
|||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
#include <cub/cub.cuh>
|
#ifndef USE_ROCM
|
||||||
#include <cub/util_type.cuh>
|
#include <cub/util_type.cuh>
|
||||||
|
#include <cub/cub.cuh>
|
||||||
|
#else
|
||||||
|
#include <hipcub/util_type.hpp>
|
||||||
|
#include <hipcub/hipcub.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
namespace moe {
|
namespace moe {
|
||||||
|
|
||||||
static constexpr int WARP_SIZE = 32;
|
|
||||||
|
|
||||||
/// Aligned array type
|
/// Aligned array type
|
||||||
template <
|
template <
|
||||||
typename T,
|
typename T,
|
||||||
@@ -265,7 +272,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
{
|
{
|
||||||
thread_max = max(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW));
|
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
|
||||||
}
|
}
|
||||||
|
|
||||||
// From this point, thread max in all the threads have the max within the row.
|
// From this point, thread max in all the threads have the max within the row.
|
||||||
@@ -282,7 +289,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
{
|
{
|
||||||
row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW);
|
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
|
||||||
}
|
}
|
||||||
|
|
||||||
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||||
@@ -332,8 +339,8 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
|||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
{
|
{
|
||||||
float other_max = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW);
|
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
|
||||||
int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW);
|
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
|
||||||
|
|
||||||
// We want lower indices to "win" in every thread so we break ties this way
|
// We want lower indices to "win" in every thread so we break ties this way
|
||||||
if (other_max > max_val || (other_max == max_val && other_expert < expert))
|
if (other_max > max_val || (other_max == max_val && other_expert < expert))
|
||||||
@@ -383,7 +390,7 @@ struct TopkConstants
|
|||||||
{
|
{
|
||||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||||
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
|
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
|
||||||
static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
|
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
|
||||||
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
||||||
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
|
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
|
||||||
static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
|
static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
|
||||||
@@ -396,7 +403,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
|||||||
{
|
{
|
||||||
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
||||||
|
|
||||||
static constexpr int BYTES_PER_LDG = std::min(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
|
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
|
||||||
static constexpr int VPT = Constants::VPT;
|
static constexpr int VPT = Constants::VPT;
|
||||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||||
|
|||||||
12
csrc/moe/torch_bindings.cpp
Normal file
12
csrc/moe/torch_bindings.cpp
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#include "registration.h"
|
||||||
|
#include "moe_ops.h"
|
||||||
|
|
||||||
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||||
|
// Apply topk softmax to the gating outputs.
|
||||||
|
m.def(
|
||||||
|
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
|
||||||
|
"token_expert_indices, Tensor gating_output) -> ()");
|
||||||
|
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
@@ -108,8 +108,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
|||||||
}
|
}
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
|
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||||
int block_size, torch::Tensor sorted_token_ids,
|
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||||
torch::Tensor experts_ids,
|
torch::Tensor experts_ids,
|
||||||
torch::Tensor num_tokens_post_pad) {
|
torch::Tensor num_tokens_post_pad) {
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|||||||
86
csrc/ops.h
86
csrc/ops.h
@@ -1,40 +1,43 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <optional>
|
||||||
|
#include <torch/library.h>
|
||||||
|
|
||||||
void paged_attention_v1(
|
void paged_attention_v1(
|
||||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step);
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step);
|
||||||
|
|
||||||
void paged_attention_v2(
|
void paged_attention_v2(
|
||||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
const int blocksparse_local_blocks, const int blocksparse_vert_stride,
|
const int64_t blocksparse_local_blocks,
|
||||||
const int blocksparse_block_size, const int blocksparse_head_sliding_step);
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step);
|
||||||
|
|
||||||
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
||||||
float epsilon);
|
double epsilon);
|
||||||
|
|
||||||
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
||||||
torch::Tensor& weight, float epsilon);
|
torch::Tensor& weight, double epsilon);
|
||||||
|
|
||||||
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
||||||
torch::Tensor& key, int head_size,
|
torch::Tensor& key, int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, bool is_neox);
|
torch::Tensor& cos_sin_cache, bool is_neox);
|
||||||
|
|
||||||
void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
||||||
torch::Tensor& key, int head_size,
|
torch::Tensor& key, int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, bool is_neox,
|
torch::Tensor& cos_sin_cache, bool is_neox,
|
||||||
int rot_dim,
|
int64_t rot_dim,
|
||||||
torch::Tensor& cos_sin_cache_offsets);
|
torch::Tensor& cos_sin_cache_offsets);
|
||||||
|
|
||||||
void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
|
void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
|
||||||
@@ -47,6 +50,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);
|
|||||||
|
|
||||||
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
|
void gelu_quick(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
|
||||||
const torch::Tensor& codebooks,
|
const torch::Tensor& codebooks,
|
||||||
@@ -60,12 +65,12 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes,
|
|||||||
|
|
||||||
torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
|
torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
|
||||||
torch::Tensor _scaling_factors, torch::Tensor _zeros,
|
torch::Tensor _scaling_factors, torch::Tensor _zeros,
|
||||||
int split_k_iters);
|
int64_t split_k_iters);
|
||||||
|
|
||||||
torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
||||||
torch::Tensor _scaling_factors,
|
torch::Tensor _scaling_factors,
|
||||||
torch::Tensor _zeros, int split_k_iters, int thx,
|
torch::Tensor _zeros, int64_t split_k_iters,
|
||||||
int thy);
|
int64_t thx, int64_t thy);
|
||||||
|
|
||||||
torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||||
torch::Tensor& b_scales, torch::Tensor& workspace,
|
torch::Tensor& b_scales, torch::Tensor& workspace,
|
||||||
@@ -88,14 +93,25 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
|
|||||||
int64_t size_k, int64_t size_n,
|
int64_t size_k, int64_t size_n,
|
||||||
int64_t num_bits);
|
int64_t num_bits);
|
||||||
|
|
||||||
int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
|
torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
|
||||||
torch::Tensor const& b, torch::Tensor const& a_scales,
|
torch::Tensor& b_scales, torch::Tensor& workspace,
|
||||||
torch::Tensor const& b_scales);
|
int64_t num_bits, int64_t size_m, int64_t size_n,
|
||||||
|
int64_t size_k);
|
||||||
|
|
||||||
|
bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
|
||||||
|
|
||||||
|
void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b, torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
c10::optional<torch::Tensor> const& bias);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input,
|
void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
float scale);
|
torch::Tensor const& scale);
|
||||||
|
|
||||||
|
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||||
|
torch::Tensor& scales);
|
||||||
|
|
||||||
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
||||||
torch::Tensor lookup_table);
|
torch::Tensor lookup_table);
|
||||||
@@ -103,9 +119,9 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
|
|||||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
|
||||||
torch::Tensor b_gptq_qzeros,
|
torch::Tensor b_gptq_qzeros,
|
||||||
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
|
||||||
bool use_exllama, int bit);
|
bool use_exllama, int64_t bit);
|
||||||
|
|
||||||
void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit);
|
void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
|
||||||
|
|
||||||
void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
||||||
torch::Tensor& scale);
|
torch::Tensor& scale);
|
||||||
@@ -113,28 +129,28 @@ void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
|||||||
void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
||||||
torch::Tensor& scale);
|
torch::Tensor& scale);
|
||||||
|
|
||||||
void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
|
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||||
int block_size, torch::Tensor sorted_token_ids,
|
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||||
torch::Tensor experts_ids,
|
torch::Tensor experts_ids,
|
||||||
torch::Tensor num_tokens_post_pad);
|
torch::Tensor num_tokens_post_pad);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
using fptr_t = uint64_t;
|
using fptr_t = int64_t;
|
||||||
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
||||||
const std::vector<std::string>& handles,
|
const std::vector<std::string>& handles,
|
||||||
const std::vector<int64_t>& offsets, int rank,
|
const std::vector<int64_t>& offsets, int64_t rank,
|
||||||
bool full_nvlink);
|
bool full_nvlink);
|
||||||
bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
|
bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
|
||||||
bool full_nvlink);
|
bool full_nvlink);
|
||||||
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
|
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
|
||||||
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
|
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
|
||||||
torch::Tensor& out);
|
torch::Tensor& out);
|
||||||
void dispose(fptr_t _fa);
|
void dispose(fptr_t _fa);
|
||||||
int meta_size();
|
int64_t meta_size();
|
||||||
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
||||||
const std::vector<std::string>& handles,
|
const std::vector<std::string>& handles,
|
||||||
const std::vector<int64_t>& offsets);
|
const std::vector<int64_t>& offsets);
|
||||||
std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
||||||
fptr_t _fa);
|
fptr_t _fa);
|
||||||
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
||||||
const std::vector<std::vector<int64_t>>& offsets);
|
const std::vector<std::vector<int64_t>>& offsets);
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
@@ -127,7 +127,7 @@ void rotary_embedding(
|
|||||||
// [num_tokens, num_heads * head_size]
|
// [num_tokens, num_heads * head_size]
|
||||||
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or
|
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or
|
||||||
// [num_tokens, num_kv_heads * head_size]
|
// [num_tokens, num_kv_heads * head_size]
|
||||||
int head_size,
|
int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, // [max_position, rot_dim]
|
torch::Tensor& cos_sin_cache, // [max_position, rot_dim]
|
||||||
bool is_neox) {
|
bool is_neox) {
|
||||||
int64_t num_tokens = query.numel() / query.size(-1);
|
int64_t num_tokens = query.numel() / query.size(-1);
|
||||||
@@ -138,7 +138,7 @@ void rotary_embedding(
|
|||||||
int64_t key_stride = key.stride(-2);
|
int64_t key_stride = key.stride(-2);
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(num_heads * rot_dim / 2, 512));
|
dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
|
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
|
||||||
@@ -168,9 +168,9 @@ void batched_rotary_embedding(
|
|||||||
// [num_tokens, num_heads * head_size]
|
// [num_tokens, num_heads * head_size]
|
||||||
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or
|
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or
|
||||||
// [num_tokens, num_kv_heads * head_size]
|
// [num_tokens, num_kv_heads * head_size]
|
||||||
int head_size,
|
int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, // [max_position, rot_dim]
|
torch::Tensor& cos_sin_cache, // [max_position, rot_dim]
|
||||||
bool is_neox, int rot_dim,
|
bool is_neox, int64_t rot_dim,
|
||||||
torch::Tensor& cos_sin_cache_offsets // [num_tokens]
|
torch::Tensor& cos_sin_cache_offsets // [num_tokens]
|
||||||
) {
|
) {
|
||||||
int64_t num_tokens = cos_sin_cache_offsets.size(0);
|
int64_t num_tokens = cos_sin_cache_offsets.size(0);
|
||||||
@@ -180,7 +180,7 @@ void batched_rotary_embedding(
|
|||||||
int64_t key_stride = key.stride(-2);
|
int64_t key_stride = key.stride(-2);
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(num_heads * rot_dim / 2, 512));
|
dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
|
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
|
||||||
|
|||||||
@@ -16,14 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 512) \
|
f(in_T, out_T, W_T, narrow, 512) \
|
||||||
f(in_T, out_T, W_T, narrow, 640) \
|
f(in_T, out_T, W_T, narrow, 640) \
|
||||||
f(in_T, out_T, W_T, narrow, 768) \
|
f(in_T, out_T, W_T, narrow, 768) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 896) \
|
||||||
f(in_T, out_T, W_T, narrow, 1024) \
|
f(in_T, out_T, W_T, narrow, 1024) \
|
||||||
f(in_T, out_T, W_T, narrow, 1152) \
|
f(in_T, out_T, W_T, narrow, 1152) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 1216) \
|
||||||
f(in_T, out_T, W_T, narrow, 1280) \
|
f(in_T, out_T, W_T, narrow, 1280) \
|
||||||
f(in_T, out_T, W_T, narrow, 1536) \
|
f(in_T, out_T, W_T, narrow, 1536) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 1664) \
|
||||||
f(in_T, out_T, W_T, narrow, 1728) \
|
f(in_T, out_T, W_T, narrow, 1728) \
|
||||||
f(in_T, out_T, W_T, narrow, 1792) \
|
f(in_T, out_T, W_T, narrow, 1792) \
|
||||||
f(in_T, out_T, W_T, narrow, 2048) \
|
f(in_T, out_T, W_T, narrow, 2048) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2240) \
|
||||||
f(in_T, out_T, W_T, narrow, 2304) \
|
f(in_T, out_T, W_T, narrow, 2304) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2368) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 2432) \
|
||||||
f(in_T, out_T, W_T, narrow, 2560) \
|
f(in_T, out_T, W_T, narrow, 2560) \
|
||||||
f(in_T, out_T, W_T, narrow, 2752) \
|
f(in_T, out_T, W_T, narrow, 2752) \
|
||||||
f(in_T, out_T, W_T, narrow, 2816) \
|
f(in_T, out_T, W_T, narrow, 2816) \
|
||||||
@@ -31,32 +37,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 3328) \
|
f(in_T, out_T, W_T, narrow, 3328) \
|
||||||
f(in_T, out_T, W_T, narrow, 3456) \
|
f(in_T, out_T, W_T, narrow, 3456) \
|
||||||
f(in_T, out_T, W_T, narrow, 3584) \
|
f(in_T, out_T, W_T, narrow, 3584) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 3712) \
|
||||||
f(in_T, out_T, W_T, narrow, 4096) \
|
f(in_T, out_T, W_T, narrow, 4096) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4480) \
|
||||||
f(in_T, out_T, W_T, narrow, 4608) \
|
f(in_T, out_T, W_T, narrow, 4608) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4736) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 4864) \
|
||||||
f(in_T, out_T, W_T, narrow, 5120) \
|
f(in_T, out_T, W_T, narrow, 5120) \
|
||||||
f(in_T, out_T, W_T, narrow, 5504) \
|
f(in_T, out_T, W_T, narrow, 5504) \
|
||||||
f(in_T, out_T, W_T, narrow, 5632) \
|
f(in_T, out_T, W_T, narrow, 5632) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 5888) \
|
||||||
f(in_T, out_T, W_T, narrow, 6144) \
|
f(in_T, out_T, W_T, narrow, 6144) \
|
||||||
f(in_T, out_T, W_T, narrow, 6400) \
|
f(in_T, out_T, W_T, narrow, 6400) \
|
||||||
f(in_T, out_T, W_T, narrow, 6848) \
|
f(in_T, out_T, W_T, narrow, 6848) \
|
||||||
f(in_T, out_T, W_T, narrow, 6912) \
|
f(in_T, out_T, W_T, narrow, 6912) \
|
||||||
f(in_T, out_T, W_T, narrow, 7168) \
|
f(in_T, out_T, W_T, narrow, 7168) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 7424) \
|
||||||
f(in_T, out_T, W_T, narrow, 8192) \
|
f(in_T, out_T, W_T, narrow, 8192) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 8960) \
|
||||||
f(in_T, out_T, W_T, narrow, 9216) \
|
f(in_T, out_T, W_T, narrow, 9216) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 9472) \
|
||||||
f(in_T, out_T, W_T, narrow, 10240) \
|
f(in_T, out_T, W_T, narrow, 10240) \
|
||||||
f(in_T, out_T, W_T, narrow, 11008) \
|
f(in_T, out_T, W_T, narrow, 11008) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 11264) \
|
||||||
f(in_T, out_T, W_T, narrow, 12288) \
|
f(in_T, out_T, W_T, narrow, 12288) \
|
||||||
f(in_T, out_T, W_T, narrow, 13696) \
|
f(in_T, out_T, W_T, narrow, 13696) \
|
||||||
f(in_T, out_T, W_T, narrow, 13824) \
|
f(in_T, out_T, W_T, narrow, 13824) \
|
||||||
f(in_T, out_T, W_T, narrow, 14336) \
|
f(in_T, out_T, W_T, narrow, 14336) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 14784) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 14848) \
|
||||||
f(in_T, out_T, W_T, narrow, 15360) \
|
f(in_T, out_T, W_T, narrow, 15360) \
|
||||||
f(in_T, out_T, W_T, narrow, 16384) \
|
f(in_T, out_T, W_T, narrow, 16384) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 18944) \
|
||||||
f(in_T, out_T, W_T, narrow, 20480) \
|
f(in_T, out_T, W_T, narrow, 20480) \
|
||||||
f(in_T, out_T, W_T, narrow, 22016) \
|
f(in_T, out_T, W_T, narrow, 22016) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 22528) \
|
||||||
f(in_T, out_T, W_T, narrow, 24576) \
|
f(in_T, out_T, W_T, narrow, 24576) \
|
||||||
f(in_T, out_T, W_T, narrow, 27392) \
|
f(in_T, out_T, W_T, narrow, 27392) \
|
||||||
f(in_T, out_T, W_T, narrow, 27648) \
|
f(in_T, out_T, W_T, narrow, 27648) \
|
||||||
f(in_T, out_T, W_T, narrow, 28672) \
|
f(in_T, out_T, W_T, narrow, 28672) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 29568) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 29696) \
|
||||||
f(in_T, out_T, W_T, narrow, 32000) \
|
f(in_T, out_T, W_T, narrow, 32000) \
|
||||||
f(in_T, out_T, W_T, narrow, 32256) \
|
f(in_T, out_T, W_T, narrow, 32256) \
|
||||||
f(in_T, out_T, W_T, narrow, 32512) \
|
f(in_T, out_T, W_T, narrow, 32512) \
|
||||||
@@ -65,6 +86,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 36864) \
|
f(in_T, out_T, W_T, narrow, 36864) \
|
||||||
f(in_T, out_T, W_T, narrow, 43264) \
|
f(in_T, out_T, W_T, narrow, 43264) \
|
||||||
f(in_T, out_T, W_T, narrow, 49152) \
|
f(in_T, out_T, W_T, narrow, 49152) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 49408) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 60544) \
|
||||||
|
f(in_T, out_T, W_T, narrow, 60672) \
|
||||||
f(in_T, out_T, W_T, narrow, 64000) \
|
f(in_T, out_T, W_T, narrow, 64000) \
|
||||||
f(in_T, out_T, W_T, narrow, 64256) \
|
f(in_T, out_T, W_T, narrow, 64256) \
|
||||||
f(in_T, out_T, W_T, narrow, 64512) \
|
f(in_T, out_T, W_T, narrow, 64512) \
|
||||||
@@ -74,12 +98,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, narrow, 128000) \
|
f(in_T, out_T, W_T, narrow, 128000) \
|
||||||
f(in_T, out_T, W_T, narrow, 128256) \
|
f(in_T, out_T, W_T, narrow, 128256) \
|
||||||
f(in_T, out_T, W_T, narrow, 128512) \
|
f(in_T, out_T, W_T, narrow, 128512) \
|
||||||
|
|
||||||
|
|
||||||
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
|
// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
|
||||||
// and vllm/tests/lora/test_punica.py
|
// and vllm/tests/lora/test_punica.py
|
||||||
|
|
||||||
// Used for defining kernels going from the variety of
|
// Used for defining kernels going from the variety of
|
||||||
// dim in to the narrow dim out
|
// dim in to the narrow dim out
|
||||||
// Using it for the fully sharded column
|
// Using it for the fully sharded column
|
||||||
// parallel LoRA A which splits the rank dim
|
// parallel LoRA A which splits the rank dim
|
||||||
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
|
#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
|
||||||
f(in_T, out_T, W_T, 128, narrow) \
|
f(in_T, out_T, W_T, 128, narrow) \
|
||||||
@@ -87,14 +113,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 512, narrow) \
|
f(in_T, out_T, W_T, 512, narrow) \
|
||||||
f(in_T, out_T, W_T, 640, narrow) \
|
f(in_T, out_T, W_T, 640, narrow) \
|
||||||
f(in_T, out_T, W_T, 768, narrow) \
|
f(in_T, out_T, W_T, 768, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 896, narrow) \
|
||||||
f(in_T, out_T, W_T, 1024, narrow) \
|
f(in_T, out_T, W_T, 1024, narrow) \
|
||||||
f(in_T, out_T, W_T, 1152, narrow) \
|
f(in_T, out_T, W_T, 1152, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 1216, narrow) \
|
||||||
f(in_T, out_T, W_T, 1280, narrow) \
|
f(in_T, out_T, W_T, 1280, narrow) \
|
||||||
f(in_T, out_T, W_T, 1536, narrow) \
|
f(in_T, out_T, W_T, 1536, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 1664, narrow) \
|
||||||
f(in_T, out_T, W_T, 1728, narrow) \
|
f(in_T, out_T, W_T, 1728, narrow) \
|
||||||
f(in_T, out_T, W_T, 1792, narrow) \
|
f(in_T, out_T, W_T, 1792, narrow) \
|
||||||
f(in_T, out_T, W_T, 2048, narrow) \
|
f(in_T, out_T, W_T, 2048, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2240, narrow) \
|
||||||
f(in_T, out_T, W_T, 2304, narrow) \
|
f(in_T, out_T, W_T, 2304, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2368, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 2432, narrow) \
|
||||||
f(in_T, out_T, W_T, 2560, narrow) \
|
f(in_T, out_T, W_T, 2560, narrow) \
|
||||||
f(in_T, out_T, W_T, 2752, narrow) \
|
f(in_T, out_T, W_T, 2752, narrow) \
|
||||||
f(in_T, out_T, W_T, 2816, narrow) \
|
f(in_T, out_T, W_T, 2816, narrow) \
|
||||||
@@ -102,32 +134,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 3328, narrow) \
|
f(in_T, out_T, W_T, 3328, narrow) \
|
||||||
f(in_T, out_T, W_T, 3456, narrow) \
|
f(in_T, out_T, W_T, 3456, narrow) \
|
||||||
f(in_T, out_T, W_T, 3584, narrow) \
|
f(in_T, out_T, W_T, 3584, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 3712, narrow) \
|
||||||
f(in_T, out_T, W_T, 4096, narrow) \
|
f(in_T, out_T, W_T, 4096, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4480, narrow) \
|
||||||
f(in_T, out_T, W_T, 4608, narrow) \
|
f(in_T, out_T, W_T, 4608, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4736, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 4864, narrow) \
|
||||||
f(in_T, out_T, W_T, 5120, narrow) \
|
f(in_T, out_T, W_T, 5120, narrow) \
|
||||||
f(in_T, out_T, W_T, 5504, narrow) \
|
f(in_T, out_T, W_T, 5504, narrow) \
|
||||||
f(in_T, out_T, W_T, 5632, narrow) \
|
f(in_T, out_T, W_T, 5632, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 5888, narrow) \
|
||||||
f(in_T, out_T, W_T, 6144, narrow) \
|
f(in_T, out_T, W_T, 6144, narrow) \
|
||||||
f(in_T, out_T, W_T, 6400, narrow) \
|
f(in_T, out_T, W_T, 6400, narrow) \
|
||||||
f(in_T, out_T, W_T, 6848, narrow) \
|
f(in_T, out_T, W_T, 6848, narrow) \
|
||||||
f(in_T, out_T, W_T, 6912, narrow) \
|
f(in_T, out_T, W_T, 6912, narrow) \
|
||||||
f(in_T, out_T, W_T, 7168, narrow) \
|
f(in_T, out_T, W_T, 7168, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 7424, narrow) \
|
||||||
f(in_T, out_T, W_T, 8192, narrow) \
|
f(in_T, out_T, W_T, 8192, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 8960, narrow) \
|
||||||
f(in_T, out_T, W_T, 9216, narrow) \
|
f(in_T, out_T, W_T, 9216, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 9472, narrow) \
|
||||||
f(in_T, out_T, W_T, 10240, narrow) \
|
f(in_T, out_T, W_T, 10240, narrow) \
|
||||||
f(in_T, out_T, W_T, 11008, narrow) \
|
f(in_T, out_T, W_T, 11008, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 11264, narrow) \
|
||||||
f(in_T, out_T, W_T, 12288, narrow) \
|
f(in_T, out_T, W_T, 12288, narrow) \
|
||||||
f(in_T, out_T, W_T, 13696, narrow) \
|
f(in_T, out_T, W_T, 13696, narrow) \
|
||||||
f(in_T, out_T, W_T, 13824, narrow) \
|
f(in_T, out_T, W_T, 13824, narrow) \
|
||||||
f(in_T, out_T, W_T, 14336, narrow) \
|
f(in_T, out_T, W_T, 14336, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 14784, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 14848, narrow) \
|
||||||
f(in_T, out_T, W_T, 15360, narrow) \
|
f(in_T, out_T, W_T, 15360, narrow) \
|
||||||
f(in_T, out_T, W_T, 16384, narrow) \
|
f(in_T, out_T, W_T, 16384, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 18944, narrow) \
|
||||||
f(in_T, out_T, W_T, 20480, narrow) \
|
f(in_T, out_T, W_T, 20480, narrow) \
|
||||||
f(in_T, out_T, W_T, 22016, narrow) \
|
f(in_T, out_T, W_T, 22016, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 22528, narrow) \
|
||||||
f(in_T, out_T, W_T, 24576, narrow) \
|
f(in_T, out_T, W_T, 24576, narrow) \
|
||||||
f(in_T, out_T, W_T, 27392, narrow) \
|
f(in_T, out_T, W_T, 27392, narrow) \
|
||||||
f(in_T, out_T, W_T, 27648, narrow) \
|
f(in_T, out_T, W_T, 27648, narrow) \
|
||||||
f(in_T, out_T, W_T, 28672, narrow) \
|
f(in_T, out_T, W_T, 28672, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 29568, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 29696, narrow) \
|
||||||
f(in_T, out_T, W_T, 32000, narrow) \
|
f(in_T, out_T, W_T, 32000, narrow) \
|
||||||
f(in_T, out_T, W_T, 32256, narrow) \
|
f(in_T, out_T, W_T, 32256, narrow) \
|
||||||
f(in_T, out_T, W_T, 32512, narrow) \
|
f(in_T, out_T, W_T, 32512, narrow) \
|
||||||
@@ -136,6 +183,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
|||||||
f(in_T, out_T, W_T, 36864, narrow) \
|
f(in_T, out_T, W_T, 36864, narrow) \
|
||||||
f(in_T, out_T, W_T, 43264, narrow) \
|
f(in_T, out_T, W_T, 43264, narrow) \
|
||||||
f(in_T, out_T, W_T, 49152, narrow) \
|
f(in_T, out_T, W_T, 49152, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 49408, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 60544, narrow) \
|
||||||
|
f(in_T, out_T, W_T, 60672, narrow) \
|
||||||
f(in_T, out_T, W_T, 64000, narrow) \
|
f(in_T, out_T, W_T, 64000, narrow) \
|
||||||
f(in_T, out_T, W_T, 64256, narrow) \
|
f(in_T, out_T, W_T, 64256, narrow) \
|
||||||
f(in_T, out_T, W_T, 64512, narrow) \
|
f(in_T, out_T, W_T, 64512, narrow) \
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
@@ -88,7 +88,7 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||||
torch::Tensor indicies, int64_t layer_idx, float scale) {
|
torch::Tensor indicies, int64_t layer_idx, double scale) {
|
||||||
CHECK_INPUT(y);
|
CHECK_INPUT(y);
|
||||||
CHECK_INPUT(x);
|
CHECK_INPUT(x);
|
||||||
CHECK_INPUT(w);
|
CHECK_INPUT(w);
|
||||||
@@ -320,7 +320,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
|||||||
|
|
||||||
void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||||
torch::Tensor indicies, int64_t layer_idx,
|
torch::Tensor indicies, int64_t layer_idx,
|
||||||
float scale, int64_t h_in, int64_t h_out,
|
double scale, int64_t h_in, int64_t h_out,
|
||||||
int64_t y_offset) {
|
int64_t y_offset) {
|
||||||
CHECK_INPUT(y);
|
CHECK_INPUT(y);
|
||||||
CHECK_INPUT(x);
|
CHECK_INPUT(x);
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||||
torch::Tensor indicies, int64_t layer_idx, float scale);
|
torch::Tensor indicies, int64_t layer_idx, double scale);
|
||||||
|
|
||||||
void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||||
torch::Tensor indicies, int64_t layer_idx,
|
torch::Tensor indicies, int64_t layer_idx,
|
||||||
float scale, int64_t h_in, int64_t h_out,
|
double scale, int64_t h_in, int64_t h_out,
|
||||||
int64_t y_offset);
|
int64_t y_offset);
|
||||||
|
|||||||
@@ -1,13 +0,0 @@
|
|||||||
#include <torch/extension.h>
|
|
||||||
|
|
||||||
#include "punica_ops.h"
|
|
||||||
|
|
||||||
//====== pybind ======
|
|
||||||
|
|
||||||
#define DEFINE_pybind(name) m.def(#name, &name, #name);
|
|
||||||
|
|
||||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|
||||||
m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv");
|
|
||||||
m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level,
|
|
||||||
"dispatch_bgmv_low_level");
|
|
||||||
}
|
|
||||||
18
csrc/punica/torch_bindings.cpp
Normal file
18
csrc/punica/torch_bindings.cpp
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#include "registration.h"
|
||||||
|
#include "punica_ops.h"
|
||||||
|
|
||||||
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||||
|
m.def(
|
||||||
|
"dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int "
|
||||||
|
"layer_idx, float scale) -> ()");
|
||||||
|
m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
|
||||||
|
|
||||||
|
m.def(
|
||||||
|
"dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w,"
|
||||||
|
"Tensor indicies, int layer_idx,"
|
||||||
|
"float scale, int h_in, int h_out,"
|
||||||
|
"int y_offset) -> ()");
|
||||||
|
m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level);
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||||
111
csrc/pybind.cpp
111
csrc/pybind.cpp
@@ -1,111 +0,0 @@
|
|||||||
#include "cache.h"
|
|
||||||
#include "cuda_utils.h"
|
|
||||||
#include "ops.h"
|
|
||||||
#include <torch/extension.h>
|
|
||||||
|
|
||||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|
||||||
// vLLM custom ops
|
|
||||||
pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
|
|
||||||
|
|
||||||
// Attention ops
|
|
||||||
ops.def("paged_attention_v1", &paged_attention_v1,
|
|
||||||
"Compute the attention between an input query and the cached "
|
|
||||||
"keys/values using PagedAttention.");
|
|
||||||
ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
|
|
||||||
|
|
||||||
// Activation ops
|
|
||||||
ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
|
|
||||||
ops.def("gelu_and_mul", &gelu_and_mul,
|
|
||||||
"Activation function used in GeGLU with `none` approximation.");
|
|
||||||
ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
|
|
||||||
"Activation function used in GeGLU with `tanh` approximation.");
|
|
||||||
ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
|
|
||||||
ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
|
|
||||||
|
|
||||||
// Layernorm
|
|
||||||
ops.def("rms_norm", &rms_norm,
|
|
||||||
"Apply Root Mean Square (RMS) Normalization to the input tensor.");
|
|
||||||
|
|
||||||
ops.def("fused_add_rms_norm", &fused_add_rms_norm,
|
|
||||||
"In-place fused Add and RMS Normalization");
|
|
||||||
|
|
||||||
// Rotary embedding
|
|
||||||
ops.def("rotary_embedding", &rotary_embedding,
|
|
||||||
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
|
|
||||||
|
|
||||||
ops.def("batched_rotary_embedding", &batched_rotary_embedding,
|
|
||||||
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key "
|
|
||||||
"(supports multiple loras)");
|
|
||||||
|
|
||||||
// Quantization ops
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
|
|
||||||
ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
|
|
||||||
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
|
|
||||||
ops.def("marlin_gemm", &marlin_gemm,
|
|
||||||
"Marlin (Dense) Optimized Quantized GEMM for GPTQ");
|
|
||||||
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm,
|
|
||||||
"Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ");
|
|
||||||
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm,
|
|
||||||
"gptq_marlin Optimized Quantized GEMM for GPTQ");
|
|
||||||
ops.def("gptq_marlin_repack", &gptq_marlin_repack,
|
|
||||||
"gptq_marlin repack from GPTQ");
|
|
||||||
ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
|
|
||||||
ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq,
|
|
||||||
"CUTLASS w8a8 GEMM, supporting symmetric per-tensor or "
|
|
||||||
"per-row/column quantization.");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
|
|
||||||
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
|
|
||||||
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
|
|
||||||
ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant,
|
|
||||||
"Compute FP8 quantized tensor for given scaling factor");
|
|
||||||
ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant,
|
|
||||||
"Compute FP8 quantized tensor and scaling factor");
|
|
||||||
ops.def("moe_align_block_size", &moe_align_block_size,
|
|
||||||
"Aligning the number of tokens to be processed by each expert such "
|
|
||||||
"that it is divisible by the block size.");
|
|
||||||
|
|
||||||
ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
|
|
||||||
"Compute int8 quantized tensor for given scaling factor");
|
|
||||||
|
|
||||||
// Cache ops
|
|
||||||
pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
|
|
||||||
cache_ops.def("swap_blocks", &swap_blocks,
|
|
||||||
"Swap in (out) the cache blocks from src to dst");
|
|
||||||
cache_ops.def("copy_blocks", &copy_blocks,
|
|
||||||
"Copy the cache blocks from src to dst");
|
|
||||||
cache_ops.def("reshape_and_cache", &reshape_and_cache,
|
|
||||||
"Reshape the key and value tensors and cache them");
|
|
||||||
cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash,
|
|
||||||
"Reshape the key and value tensors and cache them");
|
|
||||||
cache_ops.def("convert_fp8", &convert_fp8,
|
|
||||||
"Convert the key and value cache to fp8 data type");
|
|
||||||
|
|
||||||
// Cuda utils
|
|
||||||
pybind11::module cuda_utils =
|
|
||||||
m.def_submodule("cuda_utils", "vLLM cuda utils");
|
|
||||||
cuda_utils.def("get_device_attribute", &get_device_attribute,
|
|
||||||
"Gets the specified device attribute.");
|
|
||||||
|
|
||||||
cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
|
|
||||||
&get_max_shared_memory_per_block_device_attribute,
|
|
||||||
"Gets the maximum shared memory per block device attribute.");
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
|
||||||
// Custom all-reduce kernels
|
|
||||||
pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
|
|
||||||
custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
|
|
||||||
custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
|
|
||||||
custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
|
|
||||||
custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
|
|
||||||
custom_ar.def("dispose", &dispose, "dispose");
|
|
||||||
custom_ar.def("meta_size", &meta_size, "meta_size");
|
|
||||||
custom_ar.def("register_buffer", &register_buffer, "register_buffer");
|
|
||||||
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
|
|
||||||
"get_graph_buffer_ipc_meta");
|
|
||||||
custom_ar.def("register_graph_buffers", &register_graph_buffers,
|
|
||||||
"register_graph_buffers");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
@@ -18,7 +18,7 @@
|
|||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <c10/cuda/CUDAStream.h>
|
#include <c10/cuda/CUDAStream.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include "dequantize.cuh"
|
#include "dequantize.cuh"
|
||||||
@@ -435,8 +435,8 @@ __global__ void __launch_bounds__(64)
|
|||||||
|
|
||||||
torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
||||||
torch::Tensor _scaling_factors,
|
torch::Tensor _scaling_factors,
|
||||||
torch::Tensor _zeros, int split_k_iters, int thx,
|
torch::Tensor _zeros, int64_t split_k_iters,
|
||||||
int thy) {
|
int64_t thx, int64_t thy) {
|
||||||
int in_c = _kernel.size(0);
|
int in_c = _kernel.size(0);
|
||||||
int qout_c = _kernel.size(1);
|
int qout_c = _kernel.size(1);
|
||||||
int out_c = qout_c * 8;
|
int out_c = qout_c * 8;
|
||||||
@@ -491,7 +491,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
|||||||
|
|
||||||
torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
|
torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
|
||||||
torch::Tensor _scaling_factors, torch::Tensor _zeros,
|
torch::Tensor _scaling_factors, torch::Tensor _zeros,
|
||||||
int split_k_iters) {
|
int64_t split_k_iters) {
|
||||||
int num_in_feats = _in_feats.size(0);
|
int num_in_feats = _in_feats.size(0);
|
||||||
int num_in_channels = _in_feats.size(1);
|
int num_in_channels = _in_feats.size(1);
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "../../dispatch_utils.h"
|
#include "../../dispatch_utils.h"
|
||||||
|
#include "../../reduction_utils.cuh"
|
||||||
|
|
||||||
static inline __device__ int8_t float_to_int8_rn(float x) {
|
static inline __device__ int8_t float_to_int8_rn(float x) {
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
@@ -27,33 +28,88 @@ namespace vllm {
|
|||||||
|
|
||||||
template <typename scalar_t, typename scale_type>
|
template <typename scalar_t, typename scale_type>
|
||||||
__global__ void static_scaled_int8_quant_kernel(
|
__global__ void static_scaled_int8_quant_kernel(
|
||||||
const scalar_t* __restrict__ input, int8_t* __restrict__ out,
|
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
|
||||||
scale_type scale, const int hidden_size) {
|
scale_type const* scale_ptr, const int hidden_size) {
|
||||||
const int tid = threadIdx.x;
|
int const tid = threadIdx.x;
|
||||||
const int token_idx = blockIdx.x;
|
int const token_idx = blockIdx.x;
|
||||||
|
scale_type const scale = *scale_ptr;
|
||||||
|
|
||||||
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||||
out[token_idx * hidden_size + i] =
|
out[token_idx * hidden_size + i] = float_to_int8_rn(
|
||||||
float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
|
static_cast<float>(input[token_idx * hidden_size + i]) / scale);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t, typename scale_type>
|
||||||
|
__global__ void dynamic_scaled_int8_quant_kernel(
|
||||||
|
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
|
||||||
|
scale_type* scale, const int hidden_size) {
|
||||||
|
int const tid = threadIdx.x;
|
||||||
|
int const token_idx = blockIdx.x;
|
||||||
|
float absmax_val = 0.0f;
|
||||||
|
float const zero = 0.0f;
|
||||||
|
|
||||||
|
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||||
|
float val = static_cast<float>(input[token_idx * hidden_size + i]);
|
||||||
|
val = val > zero ? val : -val;
|
||||||
|
absmax_val = val > absmax_val ? val : absmax_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
float const block_absmax_val_maybe = blockReduceMax(absmax_val);
|
||||||
|
__shared__ float block_absmax_val;
|
||||||
|
if (tid == 0) {
|
||||||
|
block_absmax_val = block_absmax_val_maybe;
|
||||||
|
scale[token_idx] = block_absmax_val / 127.0f;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
float const tmp_scale = 127.0f / block_absmax_val;
|
||||||
|
for (int i = tid; i < hidden_size; i += blockDim.x) {
|
||||||
|
out[token_idx * hidden_size + i] = float_to_int8_rn(
|
||||||
|
static_cast<float>(input[token_idx * hidden_size + i]) * tmp_scale);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||||
torch::Tensor& input, // [..., hidden_size]
|
torch::Tensor const& input, // [..., hidden_size]
|
||||||
float scale) {
|
torch::Tensor const& scale) {
|
||||||
TORCH_CHECK(input.is_contiguous());
|
TORCH_CHECK(input.is_contiguous());
|
||||||
TORCH_CHECK(out.is_contiguous());
|
TORCH_CHECK(out.is_contiguous());
|
||||||
int hidden_size = input.size(-1);
|
TORCH_CHECK(scale.numel() == 1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
|
||||||
dim3 grid(num_tokens);
|
int const hidden_size = input.size(-1);
|
||||||
dim3 block(std::min(hidden_size, 1024));
|
int const num_tokens = input.numel() / hidden_size;
|
||||||
|
dim3 const grid(num_tokens);
|
||||||
|
dim3 const block(std::min(hidden_size, 1024));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
|
input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
|
||||||
vllm::static_scaled_int8_quant_kernel<scalar_t, float>
|
vllm::static_scaled_int8_quant_kernel<scalar_t, float>
|
||||||
<<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
|
<<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
|
||||||
out.data_ptr<int8_t>(), scale,
|
out.data_ptr<int8_t>(),
|
||||||
hidden_size);
|
scale.data_ptr<float>(), hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void dynamic_scaled_int8_quant(
|
||||||
|
torch::Tensor& out, // [..., hidden_size]
|
||||||
|
torch::Tensor const& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& scales) {
|
||||||
|
TORCH_CHECK(input.is_contiguous());
|
||||||
|
TORCH_CHECK(out.is_contiguous());
|
||||||
|
|
||||||
|
int const hidden_size = input.size(-1);
|
||||||
|
int const num_tokens = input.numel() / hidden_size;
|
||||||
|
dim3 const grid(num_tokens);
|
||||||
|
dim3 const block(std::min(hidden_size, 1024));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
|
||||||
|
vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
|
||||||
|
<<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
|
||||||
|
out.data_ptr<int8_t>(),
|
||||||
|
scales.data_ptr<float>(), hidden_size);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -33,20 +33,27 @@
|
|||||||
//
|
//
|
||||||
// This file is a modified excerpt of
|
// This file is a modified excerpt of
|
||||||
// include/cutlass/epilogue/fusion/visitor_load.hpp from
|
// include/cutlass/epilogue/fusion/visitor_load.hpp from
|
||||||
// https://github.com/NVIDIA/cutlass It's beem modified to support either
|
// https://github.com/NVIDIA/cutlass v3.5.0
|
||||||
// row/column or scalar broadcasting, like is already supported in CUTLASS 3.x.
|
// It has been modified to support either
|
||||||
// Important because this saves us a factor 4x on the number of kernels
|
// row/column or scalar broadcasting where the tensor being loaded from is
|
||||||
// compiled.
|
// always passed in via a device pointer. This lets one compiled kernel handle
|
||||||
|
// all cases of per-tensor or per-channel/per-token quantization.
|
||||||
|
//
|
||||||
|
// This interface also allows the scales to be passed in as tensors that
|
||||||
|
// consistently reside on the device, which avoids an issue with a previous
|
||||||
|
// implementation where scalars needed to be on the CPU since they
|
||||||
|
// were passed in via float values. This created a potential performance hazard
|
||||||
|
// if scales were initially on the device, and caused torch.compile graph
|
||||||
|
// breaks when moving scales to the CPU.
|
||||||
//
|
//
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
// Turn off clang-format for the entire file to keep it close to upstream
|
||||||
// clang-format off
|
// clang-format off
|
||||||
|
|
||||||
#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
|
#include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
|
||||||
#include "cute/tensor.hpp"
|
#include "cute/tensor.hpp"
|
||||||
|
|
||||||
// clang-format on
|
|
||||||
|
|
||||||
namespace cutlass::epilogue::threadblock {
|
namespace cutlass::epilogue::threadblock {
|
||||||
|
|
||||||
using namespace cute;
|
using namespace cute;
|
||||||
@@ -59,9 +66,11 @@ template<
|
|||||||
>
|
>
|
||||||
struct VisitorRowOrScalarBroadcast {
|
struct VisitorRowOrScalarBroadcast {
|
||||||
|
|
||||||
|
// This struct has been modified to have a bool indicating that ptr_row is a
|
||||||
|
// scalar that must be broadcast.
|
||||||
struct Arguments {
|
struct Arguments {
|
||||||
Element const* ptr_row = nullptr;
|
Element const* ptr_row = nullptr;
|
||||||
Element null_default = Element(0);
|
bool row_broadcast = true;
|
||||||
StrideMNL dRow = {};
|
StrideMNL dRow = {};
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -125,25 +134,25 @@ struct VisitorRowOrScalarBroadcast {
|
|||||||
auto coord_v = filter(tC_cRow);
|
auto coord_v = filter(tC_cRow);
|
||||||
auto dst_v = filter(tC_rRow);
|
auto dst_v = filter(tC_rRow);
|
||||||
|
|
||||||
if (params_ptr->ptr_row) {
|
if (params_ptr->row_broadcast) {
|
||||||
// In this case we are loading from a row vector and broadcasting
|
// In this case we are loading from a row vector and broadcasting
|
||||||
CUTLASS_PRAGMA_UNROLL
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int i = 0; i < size(src_v); ++i) {
|
for (int i = 0; i < size(src_v); ++i) {
|
||||||
bool guard = get<1>(coord_v(i)) < n;
|
bool guard = get<1>(coord_v(i)) < n;
|
||||||
cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
|
cutlass::arch::global_load<VecType, sizeof(VecType)>(
|
||||||
|
dst_v(i), (void const*)&src_v(i), guard);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// In this case we are loading from a scalar and broadcasting
|
// In this case we are loading from a scalar and broadcasting
|
||||||
VecType filled_vec;
|
VecType filled_vec;
|
||||||
CUTLASS_PRAGMA_UNROLL
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int i = 0; i < VecLength; i++) {
|
for (int i = 0; i < VecLength; i++) {
|
||||||
reinterpret_cast<Element*>(&filled_vec)[i] = params_ptr->null_default;
|
reinterpret_cast<Element*>(&filled_vec)[i] = *(params_ptr->ptr_row);
|
||||||
}
|
}
|
||||||
|
|
||||||
CUTLASS_PRAGMA_UNROLL
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int i = 0; i < size(src_v); ++i) {
|
for (int i = 0; i < size(src_v); ++i) {
|
||||||
if(get<1>(coord_v(i)) < n)
|
if (get<1>(coord_v(i)) < n) {
|
||||||
{
|
|
||||||
dst_v(i) = filled_vec;
|
dst_v(i) = filled_vec;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -208,9 +217,11 @@ template<
|
|||||||
>
|
>
|
||||||
struct VisitorColOrScalarBroadcast {
|
struct VisitorColOrScalarBroadcast {
|
||||||
|
|
||||||
|
// This struct has been modified to have a bool indicating that ptr_col is a
|
||||||
|
// scalar that must be broadcast.
|
||||||
struct Arguments {
|
struct Arguments {
|
||||||
Element const* ptr_col = nullptr;
|
Element const* ptr_col = nullptr;
|
||||||
Element null_default = Element(0);
|
bool col_broadcast = true;
|
||||||
StrideMNL dCol = {};
|
StrideMNL dCol = {};
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -230,11 +241,6 @@ struct VisitorColOrScalarBroadcast {
|
|||||||
|
|
||||||
struct SharedStorage { };
|
struct SharedStorage { };
|
||||||
|
|
||||||
// Global load type
|
|
||||||
static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
|
|
||||||
using VecType = uint_bit_t<cute::min(128, vec_bits)>;
|
|
||||||
static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
|
|
||||||
|
|
||||||
CUTLASS_HOST_DEVICE
|
CUTLASS_HOST_DEVICE
|
||||||
VisitorColOrScalarBroadcast() { }
|
VisitorColOrScalarBroadcast() { }
|
||||||
|
|
||||||
@@ -267,7 +273,7 @@ struct VisitorColOrScalarBroadcast {
|
|||||||
int m;
|
int m;
|
||||||
|
|
||||||
// This function is modified from VisitorColBroadcast
|
// This function is modified from VisitorColBroadcast
|
||||||
CUTLASS_DEVICE void
|
CUTLASS_DEVICE void
|
||||||
begin_epilogue() {
|
begin_epilogue() {
|
||||||
clear(tC_rCol);
|
clear(tC_rCol);
|
||||||
|
|
||||||
@@ -277,7 +283,7 @@ struct VisitorColOrScalarBroadcast {
|
|||||||
pred(i) = get<0>(tC_cCol(i)) < m;
|
pred(i) = get<0>(tC_cCol(i)) < m;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params_ptr->ptr_col) {
|
if (params_ptr->col_broadcast) {
|
||||||
// In this case we are loading from a column vector and broadcasting
|
// In this case we are loading from a column vector and broadcasting
|
||||||
copy_if(pred, tC_gCol, tC_rCol);
|
copy_if(pred, tC_gCol, tC_rCol);
|
||||||
} else {
|
} else {
|
||||||
@@ -286,8 +292,8 @@ struct VisitorColOrScalarBroadcast {
|
|||||||
|
|
||||||
CUTLASS_PRAGMA_UNROLL
|
CUTLASS_PRAGMA_UNROLL
|
||||||
for (int i = 0; i < size(dst_v); ++i) {
|
for (int i = 0; i < size(dst_v); ++i) {
|
||||||
if(pred(i)){
|
if (pred(i)) {
|
||||||
dst_v(i) = params_ptr->null_default;
|
dst_v(i) = *(params_ptr->ptr_col);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
389
csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
Normal file
389
csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
/***************************************************************************************************
|
||||||
|
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
|
||||||
|
*reserved. SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
*this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
*POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
**************************************************************************************************/
|
||||||
|
|
||||||
|
//
|
||||||
|
// This file is a modified excerpt of
|
||||||
|
// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
|
||||||
|
// from https://github.com/NVIDIA/cutlass v3.5.0
|
||||||
|
// It has been modified to support either row/column or scalar broadcasting
|
||||||
|
// where the tensor being loaded from is always passed in via a device pointer.
|
||||||
|
// This lets one compiled kernel handle all cases of per-tensor or
|
||||||
|
// per-channel/per-token quantization.
|
||||||
|
//
|
||||||
|
// This interface also allows the scales to be passed in as tensors that
|
||||||
|
// consistently reside on the device, which avoids an issue with a previous
|
||||||
|
// implementation where scalars needed to be on the CPU since they
|
||||||
|
// were passed in via float values. This created a potential performance hazard
|
||||||
|
// if scales were initially on the device, and caused torch.compile graph
|
||||||
|
// breaks when moving scales to the CPU.
|
||||||
|
//
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
// Turn off clang-format for the entire file to keep it close to upstream
|
||||||
|
// clang-format off
|
||||||
|
|
||||||
|
#include "cutlass/cutlass.h"
|
||||||
|
#include "cutlass/arch/barrier.h"
|
||||||
|
|
||||||
|
#include "cute/tensor.hpp"
|
||||||
|
#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
|
||||||
|
|
||||||
|
namespace cutlass::epilogue::fusion {
|
||||||
|
|
||||||
|
using namespace cute;
|
||||||
|
using namespace detail;
|
||||||
|
|
||||||
|
// Row vector broadcast
|
||||||
|
template<
|
||||||
|
// Row bcast reuses the mbarriers from the epilogue subtile load pipeline, so this must be at least
|
||||||
|
// ceil_div(StagesC, epi tiles per CTA tile) + 1 to ensure no data races
|
||||||
|
int Stages,
|
||||||
|
class CtaTileShapeMNK,
|
||||||
|
class Element,
|
||||||
|
class StrideMNL = Stride<_0,_1,_0>,
|
||||||
|
int Alignment = 128 / sizeof_bits_v<Element>
|
||||||
|
>
|
||||||
|
struct Sm90RowOrScalarBroadcast {
|
||||||
|
static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
|
||||||
|
static_assert(
|
||||||
|
(cute::is_same_v<StrideMNL, Stride<_0,_1, _0>>) || // row vector broadcast, e.g. per-col alpha/bias
|
||||||
|
(cute::is_same_v<StrideMNL, Stride<_0,_1,int>>)); // batched row vector broadcast
|
||||||
|
|
||||||
|
// Accumulator doesn't distribute row elements evenly amongst threads so we must buffer in smem
|
||||||
|
struct SharedStorage {
|
||||||
|
alignas(16) array_aligned<Element, size<1>(CtaTileShapeMNK{}) * Stages> smem_row;
|
||||||
|
};
|
||||||
|
|
||||||
|
// This struct has been modified to have a bool indicating that ptr_row is a
|
||||||
|
// scalar that must be broadcast, instead of containing a scalar that is
|
||||||
|
// valid if ptr_row is null.
|
||||||
|
struct Arguments {
|
||||||
|
Element const* ptr_row = nullptr;
|
||||||
|
bool row_broadcast = true;
|
||||||
|
StrideMNL dRow = {};
|
||||||
|
};
|
||||||
|
|
||||||
|
using Params = Arguments;
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static constexpr Params
|
||||||
|
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
|
||||||
|
return args;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static size_t
|
||||||
|
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static cutlass::Status
|
||||||
|
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
|
||||||
|
CudaHostAdapter* cuda_adapter = nullptr) {
|
||||||
|
return cutlass::Status::kSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_HOST_DEVICE
|
||||||
|
Sm90RowOrScalarBroadcast() { }
|
||||||
|
|
||||||
|
CUTLASS_HOST_DEVICE
|
||||||
|
Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
|
||||||
|
: params(params),
|
||||||
|
smem_row(const_cast<Element*>(shared_storage.smem_row.data())) { }
|
||||||
|
|
||||||
|
Params params;
|
||||||
|
Element* smem_row;
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_producer_load_needed() const {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_C_load_needed() const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_zero() const {
|
||||||
|
return (!params.row_broadcast && *(params.ptr_row) == Element(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int EpiTiles, class GTensor, class STensor>
|
||||||
|
struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
|
||||||
|
CUTLASS_DEVICE
|
||||||
|
ProducerLoadCallbacks(GTensor&& gRow, STensor&& sRow, Params const& params)
|
||||||
|
: gRow(cute::forward<GTensor>(gRow)),
|
||||||
|
sRow(cute::forward<STensor>(sRow)),
|
||||||
|
params(params) {}
|
||||||
|
|
||||||
|
GTensor gRow; // (CTA_M,CTA_N)
|
||||||
|
STensor sRow; // (CTA_M,CTA_N,PIPE)
|
||||||
|
Params const& params;
|
||||||
|
|
||||||
|
CUTLASS_DEVICE void
|
||||||
|
begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
|
||||||
|
if (!params.row_broadcast) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (issue_tma_load) {
|
||||||
|
// Increment the expect-tx count of the first subtile's mbarrier by the row vector's byte-size
|
||||||
|
constexpr uint32_t copy_bytes = size<1>(CtaTileShapeMNK{}) * sizeof_bits_v<Element> / 8;
|
||||||
|
cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes);
|
||||||
|
// Issue the TMA bulk copy
|
||||||
|
auto bulk_copy = Copy_Atom<SM90_BULK_COPY_AUTO, Element>{}.with(*full_mbarrier_ptr);
|
||||||
|
// Filter so we don't issue redundant copies over stride-0 modes
|
||||||
|
int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
|
||||||
|
copy(bulk_copy, filter(gRow), filter(sRow(_,_,bcast_pipe_index)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class... Args>
|
||||||
|
CUTLASS_DEVICE auto
|
||||||
|
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
|
||||||
|
|
||||||
|
auto [M, N, K, L] = args.problem_shape_mnkl;
|
||||||
|
auto [m, n, k, l] = args.tile_coord_mnkl;
|
||||||
|
Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
|
||||||
|
Tensor gRow = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N)
|
||||||
|
Tensor sRow = make_tensor(make_smem_ptr(smem_row), // (CTA_M,CTA_N,PIPE)
|
||||||
|
make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
|
||||||
|
make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
|
||||||
|
|
||||||
|
constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
|
||||||
|
return ProducerLoadCallbacks<EpiTiles, decltype(gRow), decltype(sRow)>(
|
||||||
|
cute::move(gRow), cute::move(sRow), params);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int EpiTiles, class RTensor, class STensor>
|
||||||
|
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
|
||||||
|
CUTLASS_DEVICE
|
||||||
|
ConsumerStoreCallbacks(RTensor&& tCrRow, STensor&& tCsRow, Params const& params)
|
||||||
|
: tCrRow(cute::forward<RTensor>(tCrRow)),
|
||||||
|
tCsRow(cute::forward<STensor>(tCsRow)),
|
||||||
|
params(params) {}
|
||||||
|
|
||||||
|
RTensor tCrRow; // (CPY,CPY_M,CPY_N)
|
||||||
|
STensor tCsRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
|
||||||
|
Params const& params;
|
||||||
|
|
||||||
|
CUTLASS_DEVICE void
|
||||||
|
previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
|
||||||
|
if (!params.row_broadcast) {
|
||||||
|
fill(tCrRow, *(params.ptr_row));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (epi_m == 0) { // Assumes M-major subtile loop
|
||||||
|
// Filter so we don't issue redundant copies over stride-0 modes
|
||||||
|
// (only works if 0-strides are in same location, which is by construction)
|
||||||
|
int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
|
||||||
|
copy_aligned(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ElementAccumulator, int FragmentSize>
|
||||||
|
CUTLASS_DEVICE Array<Element, FragmentSize>
|
||||||
|
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
|
||||||
|
Array<Element, FragmentSize> frg_row;
|
||||||
|
|
||||||
|
CUTLASS_PRAGMA_UNROLL
|
||||||
|
for (int i = 0; i < FragmentSize; ++i) {
|
||||||
|
frg_row[i] = tCrRow(epi_v * FragmentSize + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
return frg_row;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <
|
||||||
|
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
|
||||||
|
class... Args
|
||||||
|
>
|
||||||
|
CUTLASS_DEVICE auto
|
||||||
|
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
|
||||||
|
|
||||||
|
Tensor sRow = make_tensor(make_smem_ptr(smem_row), // (CTA_M,CTA_N,PIPE)
|
||||||
|
make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
|
||||||
|
make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
|
||||||
|
Tensor tCsRow = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
|
||||||
|
sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
|
||||||
|
Tensor tCrRow = make_tensor_like(take<0,3>(tCsRow)); // (CPY,CPY_M,CPY_N)
|
||||||
|
|
||||||
|
constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
|
||||||
|
return ConsumerStoreCallbacks<EpiTiles, decltype(tCrRow), decltype(tCsRow)>(
|
||||||
|
cute::move(tCrRow), cute::move(tCsRow), params);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Column vector broadcast
|
||||||
|
template<
|
||||||
|
int Stages,
|
||||||
|
class CtaTileShapeMNK,
|
||||||
|
class Element,
|
||||||
|
class StrideMNL = Stride<_1,_0,_0>,
|
||||||
|
int Alignment = 128 / sizeof_bits_v<Element>
|
||||||
|
>
|
||||||
|
struct Sm90ColOrScalarBroadcast {
|
||||||
|
static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet");
|
||||||
|
static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
|
||||||
|
static_assert(
|
||||||
|
(cute::is_same_v<StrideMNL, Stride<_1,_0, _0>>) || // col vector broadcast, e.g. per-row alpha/bias
|
||||||
|
(cute::is_same_v<StrideMNL, Stride<_1,_0,int>>)); // batched col vector broadcast, e.g. batched per-row bias
|
||||||
|
|
||||||
|
// Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
|
||||||
|
struct SharedStorage { };
|
||||||
|
|
||||||
|
// This struct has been modified to have a bool indicating that ptr_col is a
|
||||||
|
// scalar that must be broadcast, instead of containing a scalar that is
|
||||||
|
// valid if ptr_col is null.
|
||||||
|
struct Arguments {
|
||||||
|
Element const* ptr_col = nullptr;
|
||||||
|
bool col_broadcast = true;
|
||||||
|
StrideMNL dCol = {};
|
||||||
|
};
|
||||||
|
|
||||||
|
using Params = Arguments;
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static constexpr Params
|
||||||
|
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
|
||||||
|
return args;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static size_t
|
||||||
|
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class ProblemShape>
|
||||||
|
static cutlass::Status
|
||||||
|
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
|
||||||
|
CudaHostAdapter* cuda_adapter = nullptr) {
|
||||||
|
return cutlass::Status::kSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_producer_load_needed() const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_C_load_needed() const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_DEVICE bool
|
||||||
|
is_zero() const {
|
||||||
|
return (!params.col_broadcast && *(params.ptr_col) == Element(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
CUTLASS_HOST_DEVICE
|
||||||
|
Sm90ColOrScalarBroadcast() { }
|
||||||
|
|
||||||
|
CUTLASS_HOST_DEVICE
|
||||||
|
Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
|
||||||
|
: params(params) { }
|
||||||
|
|
||||||
|
Params params;
|
||||||
|
|
||||||
|
template <class... Args>
|
||||||
|
CUTLASS_DEVICE auto
|
||||||
|
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
|
||||||
|
return EmptyProducerLoadCallbacks{};
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class GTensor, class RTensor>
|
||||||
|
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
|
||||||
|
CUTLASS_DEVICE
|
||||||
|
ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params)
|
||||||
|
: tCgCol(cute::forward<GTensor>(tCgCol)),
|
||||||
|
tCrCol(cute::forward<RTensor>(tCrCol)),
|
||||||
|
params(params) {}
|
||||||
|
|
||||||
|
GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
|
||||||
|
RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
|
||||||
|
Params const& params;
|
||||||
|
|
||||||
|
CUTLASS_DEVICE void
|
||||||
|
begin() {
|
||||||
|
if (!params.col_broadcast) {
|
||||||
|
fill(tCrCol, *(params.ptr_col));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter so we don't issue redundant copies over stride-0 modes
|
||||||
|
// (only works if 0-strides are in same location, which is by construction)
|
||||||
|
copy_aligned(filter(tCgCol), filter(tCrCol));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ElementAccumulator, int FragmentSize>
|
||||||
|
CUTLASS_DEVICE Array<Element, FragmentSize>
|
||||||
|
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
|
||||||
|
Array<Element, FragmentSize> frg_col;
|
||||||
|
Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
|
||||||
|
|
||||||
|
CUTLASS_PRAGMA_UNROLL
|
||||||
|
for (int i = 0; i < FragmentSize; ++i) {
|
||||||
|
frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
return frg_col;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
template <
|
||||||
|
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
|
||||||
|
class... Args
|
||||||
|
>
|
||||||
|
CUTLASS_DEVICE auto
|
||||||
|
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
|
||||||
|
|
||||||
|
auto [M, N, K, L] = args.problem_shape_mnkl;
|
||||||
|
Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol);
|
||||||
|
Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
|
||||||
|
mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
|
||||||
|
Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
|
||||||
|
|
||||||
|
return ConsumerStoreCallbacks<decltype(tCgCol), decltype(tCrCol)>(
|
||||||
|
cute::move(tCgCol), cute::move(tCrCol), params);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "cutlass/cutlass.h"
|
#include "cutlass/cutlass.h"
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper function for checking CUTLASS errors
|
* Helper function for checking CUTLASS errors
|
||||||
@@ -10,3 +11,17 @@
|
|||||||
TORCH_CHECK(status == cutlass::Status::kSuccess, \
|
TORCH_CHECK(status == cutlass::Status::kSuccess, \
|
||||||
cutlassGetStatusString(status)) \
|
cutlassGetStatusString(status)) \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Round num up to the nearest power of two.
 *
 * 0 and 1 are returned unchanged. Values above 2^31 have no power of two that
 * fits in uint32_t; they saturate to 2^31 rather than shifting by the full
 * width of the type, which would be undefined behaviour.
 */
inline uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;

  constexpr uint32_t kBits = CHAR_BIT * sizeof(num);
  constexpr uint32_t kMaxPow2 = uint32_t{1} << (kBits - 1);
  if (num > kMaxPow2) return kMaxPow2;

  // num - 1 >= 1 here, so __builtin_clz never receives 0 (undefined for 0).
  // The shifted one is unsigned so that reaching bit 31 cannot overflow a
  // signed int.
  return uint32_t{1} << (kBits - __builtin_clz(num - 1));
}
|
||||||
|
|
||||||
|
/**
 * Query cudaDevAttrMaxSharedMemoryPerBlockOptin for the given device.
 *
 * The status returned by cudaDeviceGetAttribute is not inspected; the result
 * starts at 0 and is returned as whatever the call leaves in it.
 */
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
  int opt_in_smem_bytes = 0;
  cudaDeviceGetAttribute(&opt_in_smem_bytes,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
  return opt_in_smem_bytes;
}
|
||||||
|
|
||||||
|
|||||||
609
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
Normal file
609
csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
Normal file
@@ -0,0 +1,609 @@
|
|||||||
|
#include <stddef.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
|
||||||
|
// clang-format will break include orders
|
||||||
|
// clang-format off
|
||||||
|
#include "cute/tensor.hpp"
|
||||||
|
#include "cute/atom/mma_atom.hpp"
|
||||||
|
#include "cutlass/numeric_types.h"
|
||||||
|
|
||||||
|
#include "cutlass/util/device_memory.h"
|
||||||
|
|
||||||
|
#include "cutlass/cutlass.h"
|
||||||
|
#include "cutlass/gemm_coord.h"
|
||||||
|
#include "cutlass/arch/mma_sm75.h"
|
||||||
|
#include "cutlass/arch/arch.h"
|
||||||
|
#include "cutlass/arch/mma.h"
|
||||||
|
#include "cutlass/gemm/device/gemm.h"
|
||||||
|
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||||
|
|
||||||
|
#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
|
||||||
|
#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
|
||||||
|
|
||||||
|
#include "broadcast_load_epilogue_c2x.hpp"
|
||||||
|
#include "common.hpp"
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
using namespace cute;
|
||||||
|
|
||||||
|
/*
|
||||||
|
This file defines quantized GEMM operations using the CUTLASS 2.x API, for
|
||||||
|
NVIDIA GPUs with SM versions prior to sm90 (Hopper).
|
||||||
|
|
||||||
|
Epilogue functions can be defined to post-process the output before it is
|
||||||
|
written to GPU memory.
|
||||||
|
Epilogues must contain a public type named EVTCompute of type Sm80EVT,
|
||||||
|
as well as a static prepare_args function that constructs an
|
||||||
|
EVTCompute::Arguments struct.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Wrappers for the GEMM kernel that are used to guard against compilation on
|
||||||
|
// architectures that will never use the kernel. The purpose of this is to
|
||||||
|
// reduce the size of the compiled binary.
|
||||||
|
// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
|
||||||
|
// into code that will be executed on the device where it is defined.
|
||||||
|
template <typename Kernel>
|
||||||
|
struct enable_sm75_to_sm80 : Kernel {
|
||||||
|
template <typename... Args>
|
||||||
|
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||||
|
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
|
||||||
|
Kernel::invoke(std::forward<Args>(args)...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Kernel>
|
||||||
|
struct enable_sm80_to_sm89 : Kernel {
|
||||||
|
template <typename... Args>
|
||||||
|
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||||
|
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
|
||||||
|
Kernel::invoke(std::forward<Args>(args)...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Kernel>
|
||||||
|
struct enable_sm89_to_sm90 : Kernel {
|
||||||
|
template <typename... Args>
|
||||||
|
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||||
|
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
|
||||||
|
Kernel::invoke(std::forward<Args>(args)...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This class provides the common ScaleA and ScaleB descriptors for the
|
||||||
|
* ScaledEpilogue and ScaledEpilogueBias classes.
|
||||||
|
*/
|
||||||
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
|
struct ScaledEpilogueBase {
|
||||||
|
protected:
|
||||||
|
using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
|
||||||
|
|
||||||
|
using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
|
||||||
|
OutputTileThreadMap, float, Stride<Int<1>, Int<0>, Int<0>>>;
|
||||||
|
|
||||||
|
using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
|
||||||
|
OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
This epilogue function defines a quantized GEMM operation similar to
|
||||||
|
torch._scaled_mm.
|
||||||
|
|
||||||
|
A and B may both be either int8 or fp8_e4m3. A can be quantized per-tensor or
|
||||||
|
per-row. B can be quantized per-tensor or per-column.
|
||||||
|
Any combination of per-tensor and per-row or column is supported.
|
||||||
|
A and B must have symmetric quantization (zero point == 0).
|
||||||
|
|
||||||
|
So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
|
||||||
|
scales are applied elementwise with numpy-style broadcasting.
|
||||||
|
|
||||||
|
ScaleA and ScaleB define the epilogue functions that apply the scales for
|
||||||
|
the A and B operands respectively. These scales may be either per-tensor or
|
||||||
|
per row or column.
|
||||||
|
*/
|
||||||
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
|
struct ScaledEpilogue
|
||||||
|
: private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
|
||||||
|
private:
|
||||||
|
using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
|
|
||||||
|
using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiplies, float, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using EVTCompute0 =
|
||||||
|
cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
|
||||||
|
|
||||||
|
using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiplies, ElementD, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using EVTCompute =
|
||||||
|
cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
|
||||||
|
using ArgumentType = typename EVTCompute::Arguments;
|
||||||
|
|
||||||
|
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales) {
|
||||||
|
using ScaleAArgs = typename ScaleA::Arguments;
|
||||||
|
using ScaleBArgs = typename ScaleB::Arguments;
|
||||||
|
|
||||||
|
ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
|
||||||
|
ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
|
||||||
|
|
||||||
|
typename EVTCompute0::Arguments evt0_compute_args{b_args};
|
||||||
|
|
||||||
|
typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args};
|
||||||
|
return evt_compute_args;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename ElementD, typename OutputTileThreadMap>
|
||||||
|
struct ScaledEpilogueBias
|
||||||
|
: private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
|
||||||
|
private:
|
||||||
|
using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
|
||||||
|
using Accum = typename SUPER::Accum;
|
||||||
|
using ScaleA = typename SUPER::ScaleA;
|
||||||
|
using ScaleB = typename SUPER::ScaleB;
|
||||||
|
|
||||||
|
using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiplies, float, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using EVTCompute0 =
|
||||||
|
cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
|
||||||
|
|
||||||
|
using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
|
||||||
|
cutlass::multiply_add, ElementD, float,
|
||||||
|
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||||
|
|
||||||
|
using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast<
|
||||||
|
OutputTileThreadMap, ElementD, Stride<Int<0>, Int<1>, Int<0>>>;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
|
||||||
|
EVTCompute0, Bias>;
|
||||||
|
using ArgumentType = typename EVTCompute::Arguments;
|
||||||
|
|
||||||
|
static ArgumentType prepare_args(torch::Tensor const& a_scales,
|
||||||
|
torch::Tensor const& b_scales,
|
||||||
|
torch::Tensor const& bias) {
|
||||||
|
using ScaleAArgs = typename ScaleA::Arguments;
|
||||||
|
using ScaleBArgs = typename ScaleB::Arguments;
|
||||||
|
using BiasArgs = typename Bias::Arguments;
|
||||||
|
|
||||||
|
ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
|
||||||
|
ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
|
||||||
|
BiasArgs bias_args{static_cast<ElementD*>(bias.data_ptr()), {}};
|
||||||
|
|
||||||
|
typename EVTCompute0::Arguments evt0_compute_args{b_args};
|
||||||
|
|
||||||
|
typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args,
|
||||||
|
bias_args};
|
||||||
|
return evt_compute_args;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Arch, template <typename> typename ArchGuard,
|
||||||
|
typename ElementAB_, typename ElementD_,
|
||||||
|
template <typename, typename> typename Epilogue_, typename TileShape,
|
||||||
|
typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
|
||||||
|
struct cutlass_2x_gemm {
|
||||||
|
using ElementAB = ElementAB_;
|
||||||
|
using ElementD = ElementD_;
|
||||||
|
|
||||||
|
using ElementAcc =
|
||||||
|
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
|
||||||
|
float>::type;
|
||||||
|
|
||||||
|
using Operator =
|
||||||
|
typename std::conditional<std::is_same_v<ElementAB, int8_t>,
|
||||||
|
cutlass::arch::OpMultiplyAddSaturate,
|
||||||
|
cutlass::arch::OpMultiplyAdd>::type;
|
||||||
|
|
||||||
|
using OutputTileThreadMap =
|
||||||
|
cutlass::epilogue::threadblock::OutputTileThreadLayout<
|
||||||
|
TileShape, WarpShape, float, 4, 1 /* epilogue stages */
|
||||||
|
>;
|
||||||
|
|
||||||
|
using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
|
||||||
|
using EVTCompute = typename Epilogue::EVTCompute;
|
||||||
|
|
||||||
|
using D = cutlass::epilogue::threadblock::VisitorAuxStore<
|
||||||
|
OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
|
||||||
|
Stride<int64_t, Int<1>, Int<0>>>;
|
||||||
|
|
||||||
|
using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
using RowMajor = typename cutlass::layout::RowMajor;
|
||||||
|
using ColumnMajor = typename cutlass::layout::ColumnMajor;
|
||||||
|
using KernelType =
|
||||||
|
ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
|
||||||
|
ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
|
||||||
|
ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
|
||||||
|
float, cutlass::layout::RowMajor, 4,
|
||||||
|
ElementAcc, float, cutlass::arch::OpClassTensorOp,
|
||||||
|
Arch,
|
||||||
|
TileShape, WarpShape, InstructionShape,
|
||||||
|
EVTD,
|
||||||
|
cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
|
||||||
|
MainLoopStages, Operator,
|
||||||
|
1 /* epilogue stages */
|
||||||
|
>::GemmKernel>;
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Gemm, typename... EpilogueArgs>
|
||||||
|
void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_params) {
|
||||||
|
using ElementAB = typename Gemm::ElementAB;
|
||||||
|
using ElementD = typename Gemm::ElementD;
|
||||||
|
|
||||||
|
int32_t m = a.size(0);
|
||||||
|
int32_t n = b.size(1);
|
||||||
|
int32_t k = a.size(1);
|
||||||
|
cutlass::gemm::GemmCoord problem_size{m, n, k};
|
||||||
|
|
||||||
|
int64_t lda = a.stride(0);
|
||||||
|
int64_t ldb = b.stride(1);
|
||||||
|
int64_t ldc = out.stride(0);
|
||||||
|
|
||||||
|
using StrideC = Stride<int64_t, Int<1>, Int<0>>;
|
||||||
|
StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
|
||||||
|
|
||||||
|
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
|
||||||
|
auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
|
||||||
|
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
|
||||||
|
|
||||||
|
typename Gemm::D::Arguments d_args{c_ptr, c_stride};
|
||||||
|
|
||||||
|
using Epilogue = typename Gemm::Epilogue;
|
||||||
|
auto evt_args =
|
||||||
|
Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
|
||||||
|
|
||||||
|
typename Gemm::EVTD::Arguments epilogue_args{
|
||||||
|
evt_args,
|
||||||
|
d_args,
|
||||||
|
};
|
||||||
|
|
||||||
|
typename Gemm::Op::Arguments args{
|
||||||
|
cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode
|
||||||
|
problem_size, // problem size
|
||||||
|
1, // batch count
|
||||||
|
epilogue_args,
|
||||||
|
a_ptr,
|
||||||
|
b_ptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
lda,
|
||||||
|
ldb,
|
||||||
|
ldc,
|
||||||
|
ldc};
|
||||||
|
|
||||||
|
// Launch the CUTLASS GEMM kernel.
|
||||||
|
typename Gemm::Op gemm_op;
|
||||||
|
size_t workspace_size = gemm_op.get_workspace_size(args);
|
||||||
|
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
|
||||||
|
|
||||||
|
auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
|
||||||
|
|
||||||
|
CUTLASS_CHECK(gemm_op.can_implement(args));
|
||||||
|
cutlass::Status status = gemm_op(args, workspace.get(), stream);
|
||||||
|
CUTLASS_CHECK(status);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
|
||||||
|
void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... args) {
|
||||||
|
// In some cases, the GPU isn't able to accommodate the
|
||||||
|
// shared memory requirements of the Gemm. In such cases, use
|
||||||
|
// the FallbackGemm instead.
|
||||||
|
static const int max_shared_mem_per_block_opt_in =
|
||||||
|
get_cuda_max_shared_memory_per_block_opt_in(0);
|
||||||
|
|
||||||
|
size_t const gemm_shared_mem_size =
|
||||||
|
sizeof(typename Gemm::KernelType::SharedStorage);
|
||||||
|
size_t const fallback_gemm_shared_mem_size =
|
||||||
|
sizeof(typename FallbackGemm::KernelType::SharedStorage);
|
||||||
|
|
||||||
|
if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
|
||||||
|
return cutlass_gemm_caller<Gemm>(out, a, b,
|
||||||
|
std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(fallback_gemm_shared_mem_size <=
|
||||||
|
max_shared_mem_per_block_opt_in);
|
||||||
|
return cutlass_gemm_caller<FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_default {
|
||||||
|
// This config is used in 2 cases,
|
||||||
|
// - M in (128, inf)
|
||||||
|
// - M in (64, 128] and N >= 8192
|
||||||
|
// Shared Memory required by this Gemm - 81920 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M64 {
|
||||||
|
// This config is used in 2 cases,
|
||||||
|
// - M in (32, 64]
|
||||||
|
// - M in (64, 128] and N < 8192
|
||||||
|
// Shared Memory required by this Gemm - 122880 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M32 {
|
||||||
|
// M in (16, 32]
|
||||||
|
// Shared Memory required by this Gemm - 61440 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue>
|
||||||
|
struct sm80_config_M16 {
|
||||||
|
// M in [1, 16]
|
||||||
|
// Shared Memory required by this Gemm - 51200 bytes
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
using Cutlass2xGemm =
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... args) {
|
||||||
|
static_assert(std::is_same<InType, int8_t>());
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
using Cutlass2xGemmDefault =
|
||||||
|
typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM128BigN =
|
||||||
|
typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM128SmallN =
|
||||||
|
typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM64 =
|
||||||
|
typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM32 =
|
||||||
|
typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
using Cutlass2xGemmM16 =
|
||||||
|
typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
|
||||||
|
// Due to shared memory requirements, some Gemms may fail to run on some
|
||||||
|
// GPUs. As the name indicates, the Fallback Gemm is used as an alternative
|
||||||
|
// in such cases.
|
||||||
|
// sm80_config_M16 has the least shared-memory requirement. However,
|
||||||
|
// based on some profiling, we select sm80_config_M32 as a better alternative
|
||||||
|
// performance wise.
|
||||||
|
using FallbackGemm =
|
||||||
|
typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
|
||||||
|
|
||||||
|
uint32_t const m = a.size(0);
|
||||||
|
uint32_t const mp2 =
|
||||||
|
std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2
|
||||||
|
if (mp2 <= 16) {
|
||||||
|
// M in [1, 16]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 32) {
|
||||||
|
// M in (16, 32]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 64) {
|
||||||
|
// M in (32, 64]
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 128) {
|
||||||
|
// M in (64, 128]
|
||||||
|
uint32_t const n = out.size(1);
|
||||||
|
bool const small_n = n < 8192;
|
||||||
|
if (small_n) {
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
|
||||||
|
FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// M in (128, inf)
|
||||||
|
return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
|
||||||
|
|
||||||
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
|
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
|
cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SM75 entry point for the scaled matmul.
//
// Requires float32 `a_scales` and `b_scales`. When `bias` is present its dtype
// must equal out.dtype() and the ScaledEpilogueBias epilogue is used;
// otherwise the ScaledEpilogue epilogue is used.
void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias) {
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

  if (!bias) {
    // No bias supplied: scale-only epilogue.
    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogue>(out, a, b, a_scales,
                                                           b_scales);
  }

  TORCH_CHECK(bias->dtype() == out.dtype(),
              "currently bias dtype must match output dtype ", out.dtype());
  return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogueBias>(
      out, a, b, a_scales, b_scales, *bias);
}
|
||||||
|
|
||||||
|
template <template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kInt8);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
|
return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SM80 entry point for the scaled matmul.
//
// Requires float32 `a_scales` and `b_scales`. When `bias` is present its dtype
// must equal out.dtype() and the ScaledEpilogueBias epilogue is used;
// otherwise the ScaledEpilogue epilogue is used.
void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias) {
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

  if (!bias) {
    // No bias supplied: scale-only epilogue.
    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogue>(out, a, b, a_scales,
                                                           b_scales);
  }

  TORCH_CHECK(bias->dtype() == out.dtype(),
              "currently bias dtype must match output dtype ", out.dtype());
  return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogueBias>(
      out, a, b, a_scales, b_scales, *bias);
}
|
||||||
|
|
||||||
|
template <template <typename, typename> typename Epilogue,
|
||||||
|
typename... EpilogueArgs>
|
||||||
|
void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
|
||||||
|
torch::Tensor const& b,
|
||||||
|
EpilogueArgs&&... epilogue_args) {
|
||||||
|
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||||
|
using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
|
||||||
|
using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
|
||||||
|
|
||||||
|
if (a.dtype() == torch::kInt8) {
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kInt8);
|
||||||
|
|
||||||
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
|
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
} else {
|
||||||
|
assert(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_caller<cutlass_2x_gemm<
|
||||||
|
cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
|
||||||
|
Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
|
||||||
|
|
||||||
|
if (out.dtype() == torch::kBFloat16) {
|
||||||
|
return cutlass_gemm_caller<
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||||
|
cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue,
|
||||||
|
TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
} else {
|
||||||
|
TORCH_CHECK(out.dtype() == torch::kFloat16);
|
||||||
|
return cutlass_gemm_caller<
|
||||||
|
cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||||
|
cutlass::float_e4m3_t, cutlass::half_t, Epilogue,
|
||||||
|
TileShape, WarpShape, InstructionShape, 5>>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SM89 entry point for the scaled matmul.
//
// Requires float32 `a_scales` and `b_scales`. When `bias` is present its dtype
// must equal out.dtype() and the ScaledEpilogueBias epilogue is used;
// otherwise the ScaledEpilogue epilogue is used.
void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
                            torch::Tensor const& b,
                            torch::Tensor const& a_scales,
                            torch::Tensor const& b_scales,
                            c10::optional<torch::Tensor> const& bias) {
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);

  if (!bias) {
    // No bias supplied: scale-only epilogue.
    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogue>(out, a, b, a_scales,
                                                           b_scales);
  }

  TORCH_CHECK(bias->dtype() == out.dtype(),
              "currently bias dtype must match output dtype ", out.dtype());
  return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogueBias>(
      out, a, b, a_scales, b_scales, *bias);
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user