Compare commits
408 Commits
v0.6.3.pos
...
v0.6.4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02dbf30e9a | ||
|
|
2ac6d0e75b | ||
|
|
2ec8827288 | ||
|
|
b40cf6402e | ||
|
|
2885ba0e24 | ||
|
|
bf2ddc6610 | ||
|
|
972112d82f | ||
|
|
11cd1ae6ad | ||
|
|
554af9228d | ||
|
|
b2e0ad3b59 | ||
|
|
4a18fd14ba | ||
|
|
1dbae0329c | ||
|
|
675d603400 | ||
|
|
03025c023f | ||
|
|
29f3ef26a3 | ||
|
|
294bf467ba | ||
|
|
52b48c1ead | ||
|
|
f67ce05d0b | ||
|
|
e0853b6508 | ||
|
|
504ac53d18 | ||
|
|
15bb8330aa | ||
|
|
ac49b59d8b | ||
|
|
0b8bb86bf1 | ||
|
|
bb7991aa29 | ||
|
|
d909acf9fe | ||
|
|
b6dde33019 | ||
|
|
1b886aa104 | ||
|
|
3945c82346 | ||
|
|
032fcf16ae | ||
|
|
56a955e774 | ||
|
|
bbd3e86926 | ||
|
|
0d4ea3fb5c | ||
|
|
112fa0bbe5 | ||
|
|
377b74fe87 | ||
|
|
18081451f9 | ||
|
|
96ae0eaeb2 | ||
|
|
1f55e05713 | ||
|
|
8a06428c70 | ||
|
|
b41fb9d3b1 | ||
|
|
7c65527918 | ||
|
|
47db6ec831 | ||
|
|
176fcb1c71 | ||
|
|
a838ba7254 | ||
|
|
36c513a076 | ||
|
|
d201d41973 | ||
|
|
3a28f18b0b | ||
|
|
812c981fa0 | ||
|
|
7f5edb5900 | ||
|
|
eea55cca5b | ||
|
|
9cdba9669c | ||
|
|
d1c6799b88 | ||
|
|
6ace6fba2c | ||
|
|
08f93e7439 | ||
|
|
9d5b4e4dea | ||
|
|
8a7fe47d32 | ||
|
|
4800339c62 | ||
|
|
fe15729a2b | ||
|
|
330e82d34a | ||
|
|
d7a4f2207b | ||
|
|
f9dadfbee3 | ||
|
|
25144ceed0 | ||
|
|
e6de9784d2 | ||
|
|
36fc439de0 | ||
|
|
874f551b36 | ||
|
|
2cebda42bb | ||
|
|
5fb1f935b0 | ||
|
|
36e4acd02a | ||
|
|
58170d6503 | ||
|
|
9804ac7c7c | ||
|
|
f89d18ff74 | ||
|
|
f0f2e5638e | ||
|
|
ad9a78bf64 | ||
|
|
73b9083e99 | ||
|
|
20cf2f553c | ||
|
|
bfb7d61a7c | ||
|
|
19682023b6 | ||
|
|
9fa4bdde9d | ||
|
|
51c2e1fcef | ||
|
|
b09895a618 | ||
|
|
d88bff1b96 | ||
|
|
9e37266420 | ||
|
|
8a4358ecb5 | ||
|
|
bd46357ad9 | ||
|
|
f192aeba74 | ||
|
|
8e1529dc57 | ||
|
|
1a95f10ee7 | ||
|
|
49d2a41a86 | ||
|
|
47672f38b5 | ||
|
|
f83feccd7f | ||
|
|
e0191a95d8 | ||
|
|
d7edca1dee | ||
|
|
127c07480e | ||
|
|
10b67d865d | ||
|
|
4f93dfe952 | ||
|
|
e1b5a82179 | ||
|
|
87713c6053 | ||
|
|
b5815c8413 | ||
|
|
6b30471586 | ||
|
|
f6778620a9 | ||
|
|
0535e5fe6c | ||
|
|
b489fc3c91 | ||
|
|
208ce622c7 | ||
|
|
1ff4aed5bd | ||
|
|
f10797c0ce | ||
|
|
f4c2187e29 | ||
|
|
aea6ad629f | ||
|
|
da07a9ead7 | ||
|
|
3a7f15a398 | ||
|
|
7371749d54 | ||
|
|
ad39bd640c | ||
|
|
40d0e7411d | ||
|
|
6bb52b0f97 | ||
|
|
201fc07730 | ||
|
|
42b4f46b71 | ||
|
|
073a472728 | ||
|
|
93bff421bc | ||
|
|
28b2877d30 | ||
|
|
97b8475beb | ||
|
|
a2f1f3b089 | ||
|
|
3be5b26a76 | ||
|
|
de0e61a323 | ||
|
|
9d43afcc53 | ||
|
|
ae62fd17c0 | ||
|
|
a62bc0109c | ||
|
|
999df95b4e | ||
|
|
a6f332d0d9 | ||
|
|
0dfba97b42 | ||
|
|
aa9078fa03 | ||
|
|
e036e527a0 | ||
|
|
6192e9b8fe | ||
|
|
d7263a1bb8 | ||
|
|
104d729656 | ||
|
|
db7db4aab9 | ||
|
|
1fa020c539 | ||
|
|
e7b84c394d | ||
|
|
a4b3e0c1e9 | ||
|
|
29862b884b | ||
|
|
d3859f1891 | ||
|
|
4ab3256644 | ||
|
|
719c1ca468 | ||
|
|
74f2f8a0f1 | ||
|
|
d58268c56a | ||
|
|
87bd7e0515 | ||
|
|
098f94de42 | ||
|
|
399c798608 | ||
|
|
406d4cc480 | ||
|
|
a5bba7d234 | ||
|
|
2003cc3513 | ||
|
|
6a585a23d2 | ||
|
|
a02a50e6e5 | ||
|
|
a5fda50a10 | ||
|
|
21063c11c7 | ||
|
|
4be3a45158 | ||
|
|
4089985552 | ||
|
|
9d59b75593 | ||
|
|
ea928f608c | ||
|
|
2bcbae704c | ||
|
|
ffc0f2b47a | ||
|
|
82bfc38d07 | ||
|
|
c4cacbaa7f | ||
|
|
0c63c34f72 | ||
|
|
966e31697b | ||
|
|
43300bd98a | ||
|
|
ca9844b340 | ||
|
|
235366fe2e | ||
|
|
02462465ea | ||
|
|
b9c64c0ca7 | ||
|
|
d2e80332a7 | ||
|
|
a53046b16f | ||
|
|
731aec5be7 | ||
|
|
09d3550372 | ||
|
|
cd34029e91 | ||
|
|
5952d81139 | ||
|
|
93dee88f6b | ||
|
|
7a83b1aec0 | ||
|
|
ad23318928 | ||
|
|
bbc3619dc8 | ||
|
|
04bbf38e05 | ||
|
|
8f0a9ca890 | ||
|
|
2094062b4e | ||
|
|
d93478b399 | ||
|
|
ac04a97a9f | ||
|
|
9a5664d4a4 | ||
|
|
04cef2c6ab | ||
|
|
6e056bcf04 | ||
|
|
5208dc7a20 | ||
|
|
1c45f4c385 | ||
|
|
603a661ae8 | ||
|
|
fb2716d641 | ||
|
|
8d72bb20fa | ||
|
|
ac6b8f19b9 | ||
|
|
ccb5376a9a | ||
|
|
ea4adeddc1 | ||
|
|
4dbcbbeb09 | ||
|
|
b67feb1274 | ||
|
|
c49f0407ba | ||
|
|
91c9ebbb1b | ||
|
|
54597724f4 | ||
|
|
1f1b6d6eda | ||
|
|
3bb4befea7 | ||
|
|
ae5279a163 | ||
|
|
1b73ab2a1f | ||
|
|
cea808f325 | ||
|
|
74b529ceee | ||
|
|
d6459b4516 | ||
|
|
e893795443 | ||
|
|
1d4cfe2be1 | ||
|
|
eed92f12fc | ||
|
|
af7380d83b | ||
|
|
a78dd3303e | ||
|
|
d522034c85 | ||
|
|
6c0b7f548d | ||
|
|
d151fde834 | ||
|
|
27cd36e6e2 | ||
|
|
18bd7587b7 | ||
|
|
598b6d7b07 | ||
|
|
aff1fd8188 | ||
|
|
4581d2cc02 | ||
|
|
1dd4cb2935 | ||
|
|
ba0d892074 | ||
|
|
30a2e80742 | ||
|
|
06386a64dd | ||
|
|
d3aa2a8b2f | ||
|
|
2b5bf20988 | ||
|
|
93a76dd21d | ||
|
|
566cd27797 | ||
|
|
37a4947dcd | ||
|
|
96e0c9cbbd | ||
|
|
031a7995f3 | ||
|
|
b63c64d95b | ||
|
|
9fb12f7848 | ||
|
|
55650c83a0 | ||
|
|
77f7ef2908 | ||
|
|
16b8f7a86f | ||
|
|
5608e611c2 | ||
|
|
3ea2dc2ec4 | ||
|
|
d087bf863e | ||
|
|
890ca36072 | ||
|
|
abbfb6134d | ||
|
|
64384bbcdf | ||
|
|
00d91c8a2c | ||
|
|
c2cd1a2142 | ||
|
|
c787f2d81d | ||
|
|
33d257735f | ||
|
|
3b3f1e7436 | ||
|
|
9ff4511e43 | ||
|
|
81f09cfd80 | ||
|
|
cc98f1e079 | ||
|
|
211fe91aa8 | ||
|
|
6aa6020f9b | ||
|
|
ff5ed6e1bc | ||
|
|
7b0365efef | ||
|
|
04a3ae0aca | ||
|
|
62fac4b9aa | ||
|
|
226688bd61 | ||
|
|
64cb1cdc3f | ||
|
|
1ab6f6b4ad | ||
|
|
bc73e9821c | ||
|
|
8d7724104a | ||
|
|
882a1ad0de | ||
|
|
67bdf8e523 | ||
|
|
0ad216f575 | ||
|
|
7585ec996f | ||
|
|
ab6f981671 | ||
|
|
ac3d748dba | ||
|
|
0ce7798f44 | ||
|
|
0f43387157 | ||
|
|
08600ddc68 | ||
|
|
74fc2d77ae | ||
|
|
622b7ab955 | ||
|
|
09500f7dde | ||
|
|
ef7865b4f9 | ||
|
|
eae3d48181 | ||
|
|
e74f2d448c | ||
|
|
7a4df5f200 | ||
|
|
c5d7fb9ddc | ||
|
|
76ed5340f0 | ||
|
|
97b61bfae6 | ||
|
|
aa0addb397 | ||
|
|
5f8d8075f9 | ||
|
|
8b0e4f2ad7 | ||
|
|
2adb4409e0 | ||
|
|
feb92fbe4a | ||
|
|
32176fee73 | ||
|
|
4e2d95e372 | ||
|
|
34a9941620 | ||
|
|
e130c40e4e | ||
|
|
3cb07a36a2 | ||
|
|
8549c82660 | ||
|
|
67a6882da4 | ||
|
|
6650e6a930 | ||
|
|
07e981fdf4 | ||
|
|
55137e8ee3 | ||
|
|
5cbdccd151 | ||
|
|
067e77f9a8 | ||
|
|
6567e13724 | ||
|
|
228cfbd03f | ||
|
|
ca0d92227e | ||
|
|
9645b9f646 | ||
|
|
a6f3721861 | ||
|
|
9f7b4ba865 | ||
|
|
c91ed47c43 | ||
|
|
59449095ab | ||
|
|
e26d37a185 | ||
|
|
722d46edb9 | ||
|
|
c866e0079d | ||
|
|
d27cfbf791 | ||
|
|
de662d32b5 | ||
|
|
f58454968f | ||
|
|
b979143d5b | ||
|
|
ad6f78053e | ||
|
|
295a061fb3 | ||
|
|
8a02cd045a | ||
|
|
4fdc581f9e | ||
|
|
3770071eb4 | ||
|
|
836e8ef6ee | ||
|
|
056a68c7db | ||
|
|
33bab41060 | ||
|
|
b7df53cd42 | ||
|
|
bb01f2915e | ||
|
|
b548d7a5f4 | ||
|
|
fc6c274626 | ||
|
|
150b779081 | ||
|
|
9013e24f7b | ||
|
|
fd0e2cfdb2 | ||
|
|
e5ac6a4199 | ||
|
|
dbdd3b5e5a | ||
|
|
e7116c017c | ||
|
|
31a08f5bd2 | ||
|
|
c18e1a3418 | ||
|
|
3ff57ebfca | ||
|
|
2394962d70 | ||
|
|
51c24c9736 | ||
|
|
831540cf04 | ||
|
|
29061ed9df | ||
|
|
65050a40e6 | ||
|
|
208cb34c81 | ||
|
|
b17046e298 | ||
|
|
d1e8240875 | ||
|
|
cb6fdaa0a0 | ||
|
|
23b899a8e6 | ||
|
|
17c79f3c36 | ||
|
|
cd5601ac37 | ||
|
|
434984e665 | ||
|
|
32a1ee74a0 | ||
|
|
08075c3448 | ||
|
|
bb392ea2d2 | ||
|
|
9dbcce84a7 | ||
|
|
a48e3ec052 | ||
|
|
6c5af09b39 | ||
|
|
3ddbe25502 | ||
|
|
0d02747f2e | ||
|
|
f7db5f0fa9 | ||
|
|
ca30c3c84b | ||
|
|
c0292211ce | ||
|
|
74692421f7 | ||
|
|
29acd2c34c | ||
|
|
f085995a7b | ||
|
|
b729901139 | ||
|
|
76a5e13270 | ||
|
|
ef7faad1b8 | ||
|
|
575dcebe9a | ||
|
|
711f3a7806 | ||
|
|
15713e3b75 | ||
|
|
d621c43df7 | ||
|
|
9d9186be97 | ||
|
|
5241aa1494 | ||
|
|
ec6bd6c4c6 | ||
|
|
8ca8954841 | ||
|
|
f6b97293aa | ||
|
|
496e991da8 | ||
|
|
696b01af8f | ||
|
|
855e0e6f97 | ||
|
|
4fa3e33349 | ||
|
|
962d2c6349 | ||
|
|
5b59fe0f08 | ||
|
|
8e3e7f2713 | ||
|
|
263d8ee150 | ||
|
|
c5eea3c8ba | ||
|
|
85dc92fc98 | ||
|
|
dfd951ed9b | ||
|
|
82c25151ec | ||
|
|
1325872ec8 | ||
|
|
380e18639f | ||
|
|
337ed76671 | ||
|
|
0c9a5258f9 | ||
|
|
d11bf435a0 | ||
|
|
9bb10a7d27 | ||
|
|
3921a2f29e | ||
|
|
67a7e5ef38 | ||
|
|
051eaf6db3 | ||
|
|
7dbe738d65 | ||
|
|
ae8b633ba3 | ||
|
|
1bbbcc0b1d | ||
|
|
25aeb7d4c9 | ||
|
|
d2b1bf55ec | ||
|
|
1ffc8a7362 | ||
|
|
944dd8edaf | ||
|
|
154a8ae880 | ||
|
|
de4008e2ab | ||
|
|
48138a8415 | ||
|
|
343f8e0905 | ||
|
|
bb76538bbd | ||
|
|
d615b5c9f8 | ||
|
|
d65049daab | ||
|
|
eca2c5f7c0 | ||
|
|
0f41fbe5a3 | ||
|
|
7871659abb |
@@ -0,0 +1,11 @@
|
|||||||
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||||
|
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.356
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.358
|
||||||
|
limit: 1000
|
||||||
|
num_fewshot: 5
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
Meta-Llama-3-8B-Instruct.yaml
|
Meta-Llama-3-8B-Instruct.yaml
|
||||||
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
|
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||||
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||||
|
|||||||
@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
lm_eval --model hf \
|
lm_eval --model hf \
|
||||||
--model_args pretrained=$MODEL,parallelize=True \
|
--model_args "pretrained=$MODEL,parallelize=True" \
|
||||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||||
--batch_size $BATCH_SIZE
|
--batch_size "$BATCH_SIZE"
|
||||||
|
|||||||
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
lm_eval --model vllm \
|
lm_eval --model vllm \
|
||||||
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
|
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
|
||||||
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
|
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||||
--batch_size $BATCH_SIZE
|
--batch_size "$BATCH_SIZE"
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
# Parse list of configs.
|
# Parse list of configs.
|
||||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
|
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||||
|
|
||||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||||
do
|
do
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ serving_column_mapping = {
|
|||||||
|
|
||||||
def read_markdown(file):
|
def read_markdown(file):
|
||||||
if os.path.exists(file):
|
if os.path.exists(file):
|
||||||
with open(file, "r") as f:
|
with open(file) as f:
|
||||||
return f.read() + "\n"
|
return f.read() + "\n"
|
||||||
else:
|
else:
|
||||||
return f"{file} not found.\n"
|
return f"{file} not found.\n"
|
||||||
@@ -75,14 +75,14 @@ if __name__ == "__main__":
|
|||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file, "r") as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `benchmark_serving.py`
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
command = json.loads(f.read())
|
command = json.loads(f.read())
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
@@ -97,7 +97,7 @@ if __name__ == "__main__":
|
|||||||
# this result is generated via `benchmark_latency.py`
|
# this result is generated via `benchmark_latency.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
command = json.loads(f.read())
|
command = json.loads(f.read())
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
@@ -119,7 +119,7 @@ if __name__ == "__main__":
|
|||||||
# this result is generated via `benchmark_throughput.py`
|
# this result is generated via `benchmark_throughput.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
command = json.loads(f.read())
|
command = json.loads(f.read())
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ def main(args):
|
|||||||
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||||
with open(test_file, "r") as f:
|
with open(test_file) as f:
|
||||||
results = results + json.loads(f.read())
|
results = results + json.loads(f.read())
|
||||||
|
|
||||||
# generate markdown table
|
# generate markdown table
|
||||||
@@ -80,7 +80,7 @@ def main(args):
|
|||||||
|
|
||||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||||
|
|
||||||
with open(args.description, "r") as f:
|
with open(args.description) as f:
|
||||||
description = f.read()
|
description = f.read()
|
||||||
|
|
||||||
description = description.format(
|
description = description.format(
|
||||||
|
|||||||
@@ -50,31 +50,30 @@ launch_trt_server() {
|
|||||||
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
|
||||||
git lfs install
|
git lfs install
|
||||||
cd tensorrtllm_backend
|
cd tensorrtllm_backend
|
||||||
git checkout $trt_llm_version
|
git checkout "$trt_llm_version"
|
||||||
tensorrtllm_backend_dir=$(pwd)
|
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
|
|
||||||
# build trtllm engine
|
# build trtllm engine
|
||||||
cd /tensorrtllm_backend
|
cd /tensorrtllm_backend
|
||||||
cd ./tensorrt_llm/examples/${model_type}
|
cd "./tensorrt_llm/examples/${model_type}"
|
||||||
python3 convert_checkpoint.py \
|
python3 convert_checkpoint.py \
|
||||||
--model_dir ${model_path} \
|
--model_dir "${model_path}" \
|
||||||
--dtype ${model_dtype} \
|
--dtype "${model_dtype}" \
|
||||||
--tp_size ${model_tp_size} \
|
--tp_size "${model_tp_size}" \
|
||||||
--output_dir ${trt_model_path}
|
--output_dir "${trt_model_path}"
|
||||||
trtllm-build \
|
trtllm-build \
|
||||||
--checkpoint_dir ${trt_model_path} \
|
--checkpoint_dir "${trt_model_path}" \
|
||||||
--use_fused_mlp \
|
--use_fused_mlp \
|
||||||
--reduce_fusion disable \
|
--reduce_fusion disable \
|
||||||
--workers 8 \
|
--workers 8 \
|
||||||
--gpt_attention_plugin ${model_dtype} \
|
--gpt_attention_plugin "${model_dtype}" \
|
||||||
--gemm_plugin ${model_dtype} \
|
--gemm_plugin "${model_dtype}" \
|
||||||
--tp_size ${model_tp_size} \
|
--tp_size "${model_tp_size}" \
|
||||||
--max_batch_size ${max_batch_size} \
|
--max_batch_size "${max_batch_size}" \
|
||||||
--max_input_len ${max_input_len} \
|
--max_input_len "${max_input_len}" \
|
||||||
--max_seq_len ${max_seq_len} \
|
--max_seq_len "${max_seq_len}" \
|
||||||
--max_num_tokens ${max_num_tokens} \
|
--max_num_tokens "${max_num_tokens}" \
|
||||||
--output_dir ${trt_engine_path}
|
--output_dir "${trt_engine_path}"
|
||||||
|
|
||||||
# handle triton protobuf files and launch triton server
|
# handle triton protobuf files and launch triton server
|
||||||
cd /tensorrtllm_backend
|
cd /tensorrtllm_backend
|
||||||
@@ -82,15 +81,15 @@ launch_trt_server() {
|
|||||||
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
|
||||||
cd triton_model_repo
|
cd triton_model_repo
|
||||||
rm -rf ./tensorrt_llm/1/*
|
rm -rf ./tensorrt_llm/1/*
|
||||||
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
|
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
|
||||||
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
|
||||||
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
|
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
|
||||||
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
|
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
|
||||||
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
|
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
|
||||||
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
|
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
|
||||||
cd /tensorrtllm_backend
|
cd /tensorrtllm_backend
|
||||||
python3 scripts/launch_triton_server.py \
|
python3 scripts/launch_triton_server.py \
|
||||||
--world_size=${model_tp_size} \
|
--world_size="${model_tp_size}" \
|
||||||
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
--model_repo=/tensorrtllm_backend/triton_model_repo &
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -98,10 +97,7 @@ launch_trt_server() {
|
|||||||
launch_tgi_server() {
|
launch_tgi_server() {
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
server_args=$(json2args "$server_params")
|
server_args=$(json2args "$server_params")
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
@@ -129,10 +125,7 @@ launch_tgi_server() {
|
|||||||
launch_lmdeploy_server() {
|
launch_lmdeploy_server() {
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
server_args=$(json2args "$server_params")
|
server_args=$(json2args "$server_params")
|
||||||
|
|
||||||
server_command="lmdeploy serve api_server $model \
|
server_command="lmdeploy serve api_server $model \
|
||||||
@@ -149,10 +142,7 @@ launch_sglang_server() {
|
|||||||
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
server_args=$(json2args "$server_params")
|
server_args=$(json2args "$server_params")
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
@@ -185,10 +175,7 @@ launch_vllm_server() {
|
|||||||
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
model=$(echo "$common_params" | jq -r '.model')
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
tp=$(echo "$common_params" | jq -r '.tp')
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
port=$(echo "$common_params" | jq -r '.port')
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
server_args=$(json2args "$server_params")
|
server_args=$(json2args "$server_params")
|
||||||
|
|
||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
@@ -217,19 +204,19 @@ launch_vllm_server() {
|
|||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
|
||||||
if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
|
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
|
||||||
launch_trt_server
|
launch_trt_server
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
|
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
|
||||||
launch_tgi_server
|
launch_tgi_server
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
|
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||||
launch_lmdeploy_server
|
launch_lmdeploy_server
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
|
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
|
||||||
launch_sglang_server
|
launch_sglang_server
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -16,10 +16,10 @@ main() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# initial annotation
|
# initial annotation
|
||||||
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||||
|
|
||||||
# download results
|
# download results
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
mkdir -p results/
|
mkdir -p results/
|
||||||
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
||||||
ls
|
ls
|
||||||
@@ -30,15 +30,15 @@ main() {
|
|||||||
/workspace/buildkite-agent artifact upload "results.zip"
|
/workspace/buildkite-agent artifact upload "results.zip"
|
||||||
|
|
||||||
# upload benchmarking scripts
|
# upload benchmarking scripts
|
||||||
cd $VLLM_SOURCE_CODE_LOC/
|
cd "$VLLM_SOURCE_CODE_LOC/"
|
||||||
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
||||||
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
||||||
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
|
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||||
# upload benchmarking pipeline
|
# upload benchmarking pipeline
|
||||||
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
||||||
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
|
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ check_gpus() {
|
|||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ kill_gpu_processes() {
|
|||||||
pkill -f text-generation
|
pkill -f text-generation
|
||||||
pkill -f lmdeploy
|
pkill -f lmdeploy
|
||||||
|
|
||||||
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
@@ -119,8 +119,8 @@ wait_for_server() {
|
|||||||
ensure_installed() {
|
ensure_installed() {
|
||||||
# Ensure that the given command is installed by apt-get
|
# Ensure that the given command is installed by apt-get
|
||||||
local cmd=$1
|
local cmd=$1
|
||||||
if ! which $cmd >/dev/null; then
|
if ! which "$cmd" >/dev/null; then
|
||||||
apt-get update && apt-get install -y $cmd
|
apt-get update && apt-get install -y "$cmd"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -173,13 +173,11 @@ run_serving_tests() {
|
|||||||
echo "Reuse previous server for test case $test_name"
|
echo "Reuse previous server for test case $test_name"
|
||||||
else
|
else
|
||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
|
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||||
"$server_params" "$common_params"
|
"$server_params" "$common_params"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
wait_for_server
|
if wait_for_server; then
|
||||||
|
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||||
else
|
else
|
||||||
@@ -190,13 +188,13 @@ run_serving_tests() {
|
|||||||
|
|
||||||
# prepare tokenizer
|
# prepare tokenizer
|
||||||
# this is required for lmdeploy.
|
# this is required for lmdeploy.
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
rm -rf /tokenizer_cache
|
rm -rf /tokenizer_cache
|
||||||
mkdir /tokenizer_cache
|
mkdir /tokenizer_cache
|
||||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||||
--model "$model" \
|
--model "$model" \
|
||||||
--cachedir /tokenizer_cache
|
--cachedir /tokenizer_cache
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
|
|
||||||
|
|
||||||
# change model name for lmdeploy (it will not follow standard hf name)
|
# change model name for lmdeploy (it will not follow standard hf name)
|
||||||
@@ -307,11 +305,11 @@ run_serving_tests() {
|
|||||||
prepare_dataset() {
|
prepare_dataset() {
|
||||||
|
|
||||||
# download sharegpt dataset
|
# download sharegpt dataset
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
echo "" > sonnet_4x.txt
|
echo "" > sonnet_4x.txt
|
||||||
for _ in {1..4}
|
for _ in {1..4}
|
||||||
do
|
do
|
||||||
@@ -339,17 +337,17 @@ main() {
|
|||||||
|
|
||||||
prepare_dataset
|
prepare_dataset
|
||||||
|
|
||||||
cd $VLLM_SOURCE_CODE_LOC/benchmarks
|
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||||
declare -g RESULTS_FOLDER=results/
|
declare -g RESULTS_FOLDER=results/
|
||||||
mkdir -p $RESULTS_FOLDER
|
mkdir -p $RESULTS_FOLDER
|
||||||
BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
|
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||||
|
|
||||||
# run the test
|
# run the test
|
||||||
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
|
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||||
|
|
||||||
# upload benchmark results to buildkite
|
# upload benchmark results to buildkite
|
||||||
python3 -m pip install tabulate pandas
|
python3 -m pip install tabulate pandas
|
||||||
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
|
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||||
upload_to_buildkite
|
upload_to_buildkite
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ check_gpus() {
|
|||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,7 +93,7 @@ kill_gpu_processes() {
|
|||||||
|
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -117,7 +117,7 @@ upload_to_buildkite() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Use the determined command to annotate and upload artifacts
|
# Use the determined command to annotate and upload artifacts
|
||||||
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
|
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
|
||||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -150,7 +150,7 @@ run_latency_tests() {
|
|||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -206,9 +206,9 @@ run_throughput_tests() {
|
|||||||
throughput_args=$(json2args "$throughput_params")
|
throughput_args=$(json2args "$throughput_params")
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
|
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -270,7 +270,7 @@ run_serving_tests() {
|
|||||||
# check if there is enough GPU to run the test
|
# check if there is enough GPU to run the test
|
||||||
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
if [[ $gpu_count -lt $tp ]]; then
|
||||||
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
|
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -278,7 +278,7 @@ run_serving_tests() {
|
|||||||
server_model=$(echo "$server_params" | jq -r '.model')
|
server_model=$(echo "$server_params" | jq -r '.model')
|
||||||
client_model=$(echo "$client_params" | jq -r '.model')
|
client_model=$(echo "$client_params" | jq -r '.model')
|
||||||
if [[ $server_model != "$client_model" ]]; then
|
if [[ $server_model != "$client_model" ]]; then
|
||||||
echo "Server model and client model must be the same. Skip testcase $testname."
|
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -293,8 +293,7 @@ run_serving_tests() {
|
|||||||
server_pid=$!
|
server_pid=$!
|
||||||
|
|
||||||
# wait until the server is alive
|
# wait until the server is alive
|
||||||
wait_for_server
|
if wait_for_server; then
|
||||||
if [ $? -eq 0 ]; then
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "vllm server is up and running."
|
echo "vllm server is up and running."
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -36,11 +36,11 @@ if __name__ == "__main__":
|
|||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file, "r") as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
with open(test_file.with_suffix(".commands"), "r") as f:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
command = json.loads(f.read())
|
command = json.loads(f.read())
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10
|
|||||||
|
|
||||||
retries=0
|
retries=0
|
||||||
while [ $retries -lt 1000 ]; do
|
while [ $retries -lt 1000 ]; do
|
||||||
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
|
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -6,28 +6,23 @@ steps:
|
|||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
# rename the files to change linux -> manylinux1
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
|
||||||
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
|
||||||
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
|
||||||
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build CUDA 11.8 wheel"
|
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
|
||||||
key: block-build-cu118-wheel
|
# However, this block can be uncommented to save some compute hours.
|
||||||
|
# - block: "Build CUDA 11.8 wheel"
|
||||||
|
# key: block-build-cu118-wheel
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 11.8"
|
- label: "Build wheel - CUDA 11.8"
|
||||||
depends_on: block-build-cu118-wheel
|
# depends_on: block-build-cu118-wheel
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue
|
queue: cpu_queue
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
# rename the files to change linux -> manylinux1
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
|
|
||||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
|
||||||
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script runs test inside the corresponding ROCm docker container.
|
# This script runs test inside the corresponding ROCm docker container.
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
@@ -31,8 +33,8 @@ cleanup_docker() {
|
|||||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
# Remove dangling images (those that are not tagged and not used by any container)
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
docker image prune -f
|
docker image prune -f
|
||||||
# Remove unused volumes
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
docker volume prune -f
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
echo "Docker images and volumes cleanup completed."
|
echo "Docker images and volumes cleanup completed."
|
||||||
else
|
else
|
||||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
@@ -57,17 +59,17 @@ done
|
|||||||
echo "--- Pulling container"
|
echo "--- Pulling container"
|
||||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||||
docker pull ${image_name}
|
docker pull "${image_name}"
|
||||||
|
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
|
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
echo "--- Running container"
|
echo "--- Running container"
|
||||||
|
|
||||||
HF_CACHE="$(realpath ~)/huggingface"
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
mkdir -p ${HF_CACHE}
|
mkdir -p "${HF_CACHE}"
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
|
||||||
commands=$@
|
commands=$@
|
||||||
@@ -107,35 +109,36 @@ fi
|
|||||||
PARALLEL_JOB_COUNT=8
|
PARALLEL_JOB_COUNT=8
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
|
# assign job count as the number of shards used
|
||||||
|
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
||||||
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
|
||||||
#replace shard arguments
|
# assign shard-id for each shard
|
||||||
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
|
||||||
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
|
echo "Shard ${GPU} commands:$commands_gpu"
|
||||||
echo "Shard ${GPU} commands:$commands"
|
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/kfd --device /dev/dri \
|
--device /dev/kfd --device /dev/dri \
|
||||||
--network host \
|
--network host \
|
||||||
--shm-size=16gb \
|
--shm-size=16gb \
|
||||||
--rm \
|
--rm \
|
||||||
-e HIP_VISIBLE_DEVICES=${GPU} \
|
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||||
-e HF_TOKEN \
|
-e HF_TOKEN \
|
||||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e HF_HOME=${HF_MOUNT} \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
--name ${container_name}_${GPU} \
|
--name "${container_name}_${GPU}" \
|
||||||
${image_name} \
|
"${image_name}" \
|
||||||
/bin/bash -c "${commands}" \
|
/bin/bash -c "${commands_gpu}" \
|
||||||
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
|
||||||
PIDS+=($!)
|
PIDS+=($!)
|
||||||
done
|
done
|
||||||
#wait for all processes to finish and collect exit codes
|
#wait for all processes to finish and collect exit codes
|
||||||
for pid in ${PIDS[@]}; do
|
for pid in "${PIDS[@]}"; do
|
||||||
wait ${pid}
|
wait "${pid}"
|
||||||
STATUS+=($?)
|
STATUS+=($?)
|
||||||
done
|
done
|
||||||
for st in ${STATUS[@]}; do
|
for st in "${STATUS[@]}"; do
|
||||||
if [[ ${st} -ne 0 ]]; then
|
if [[ ${st} -ne 0 ]]; then
|
||||||
echo "One of the processes failed with $st"
|
echo "One of the processes failed with $st"
|
||||||
exit ${st}
|
exit "${st}"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
else
|
else
|
||||||
@@ -146,9 +149,9 @@ else
|
|||||||
--rm \
|
--rm \
|
||||||
-e HIP_VISIBLE_DEVICES=0 \
|
-e HIP_VISIBLE_DEVICES=0 \
|
||||||
-e HF_TOKEN \
|
-e HF_TOKEN \
|
||||||
-v ${HF_CACHE}:${HF_MOUNT} \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e HF_HOME=${HF_MOUNT} \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
--name ${container_name} \
|
--name "${container_name}" \
|
||||||
${image_name} \
|
"${image_name}" \
|
||||||
/bin/bash -c "${commands}"
|
/bin/bash -c "${commands}"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -ex
|
||||||
@@ -13,27 +15,38 @@ remove_docker_container
|
|||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
|
||||||
|
|
||||||
# Run basic model test
|
function cpu_tests() {
|
||||||
docker exec cpu-test bash -c "
|
set -e
|
||||||
pip install pytest matplotlib einops transformers_stream_generator
|
|
||||||
pytest -v -s tests/models -m \"not vlm\" \
|
|
||||||
--ignore=tests/models/test_embedding.py \
|
|
||||||
--ignore=tests/models/test_oot_registration.py \
|
|
||||||
--ignore=tests/models/test_registry.py \
|
|
||||||
--ignore=tests/models/test_jamba.py \
|
|
||||||
--ignore=tests/models/test_mamba.py \
|
|
||||||
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
|
|
||||||
|
|
||||||
# online inference
|
# Run basic model test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
set -e
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
pip install pytest pytest-asyncio \
|
||||||
python3 benchmarks/benchmark_serving.py \
|
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||||
--backend vllm \
|
transformers_stream_generator matplotlib datamodel_code_generator
|
||||||
--dataset-name random \
|
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||||
--model facebook/opt-125m \
|
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||||
--num-prompts 20 \
|
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||||
--endpoint /v1/completions \
|
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||||
--tokenizer facebook/opt-125m"
|
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||||
|
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||||
|
|
||||||
|
# online inference
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
set -e
|
||||||
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
||||||
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model facebook/opt-125m \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--tokenizer facebook/opt-125m"
|
||||||
|
}
|
||||||
|
|
||||||
|
# All of CPU tests are expected to be finished less than 25 mins.
|
||||||
|
export -f cpu_tests
|
||||||
|
timeout 25m bash -c "cpu_tests"
|
||||||
|
|||||||
@@ -1,10 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
|
# allow to bind to different cores
|
||||||
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
|
||||||
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
|
||||||
@@ -12,46 +18,61 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
|
||||||
|
|
||||||
# offline inference
|
function cpu_tests() {
|
||||||
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
|
set -e
|
||||||
|
|
||||||
# Run basic model test
|
# offline inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test-avx2 bash -c "
|
||||||
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
|
set -e
|
||||||
pytest -v -s tests/models/encoder_decoder/language
|
python3 examples/offline_inference.py"
|
||||||
pytest -v -s tests/models/decoder_only/language \
|
|
||||||
--ignore=tests/models/test_fp8.py \
|
|
||||||
--ignore=tests/models/decoder_only/language/test_jamba.py \
|
|
||||||
--ignore=tests/models/decoder_only/language/test_mamba.py \
|
|
||||||
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
|
|
||||||
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
|
|
||||||
|
|
||||||
# Run compressed-tensor test
|
# Run basic model test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
pytest -s -v \
|
set -e
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
pip install pytest pytest-asyncio \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
decord einops librosa peft Pillow sentence-transformers soundfile \
|
||||||
|
transformers_stream_generator matplotlib datamodel_code_generator
|
||||||
|
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
|
||||||
|
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||||
|
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||||
|
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||||
|
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||||
|
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||||
|
|
||||||
# Run AWQ test
|
# Run compressed-tensor test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
pytest -s -v \
|
set -e
|
||||||
tests/quantization/test_ipex_quant.py"
|
pytest -s -v \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||||
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||||
|
|
||||||
# online inference
|
# Run AWQ test
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
export VLLM_CPU_KVCACHE_SPACE=10
|
set -e
|
||||||
export VLLM_CPU_OMP_THREADS_BIND=48-92
|
pytest -s -v \
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
|
tests/quantization/test_ipex_quant.py"
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
|
||||||
python3 benchmarks/benchmark_serving.py \
|
# online inference
|
||||||
--backend vllm \
|
docker exec cpu-test bash -c "
|
||||||
--dataset-name random \
|
set -e
|
||||||
--model facebook/opt-125m \
|
export VLLM_CPU_KVCACHE_SPACE=10
|
||||||
--num-prompts 20 \
|
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||||
--endpoint /v1/completions \
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||||
--tokenizer facebook/opt-125m"
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset-name random \
|
||||||
|
--model facebook/opt-125m \
|
||||||
|
--num-prompts 20 \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--tokenizer facebook/opt-125m"
|
||||||
|
}
|
||||||
|
|
||||||
|
# All of CPU tests are expected to be finished less than 25 mins.
|
||||||
|
export -f cpu_tests
|
||||||
|
timeout 25m bash -c "cpu_tests $CORE_RANGE"
|
||||||
|
|||||||
16
.buildkite/run-hpu-test.sh
Normal file
16
.buildkite/run-hpu-test.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
|
||||||
@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
|
|||||||
|
|
||||||
shift 4
|
shift 4
|
||||||
COMMANDS=("$@")
|
COMMANDS=("$@")
|
||||||
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
|
||||||
echo "The number of commands must be equal to the number of nodes."
|
echo "The number of commands must be equal to the number of nodes."
|
||||||
echo "Number of nodes: $NUM_NODES"
|
echo "Number of nodes: $NUM_NODES"
|
||||||
echo "Number of commands: ${#COMMANDS[@]}"
|
echo "Number of commands: ${#COMMANDS[@]}"
|
||||||
@@ -23,7 +23,7 @@ fi
|
|||||||
|
|
||||||
echo "List of commands"
|
echo "List of commands"
|
||||||
for command in "${COMMANDS[@]}"; do
|
for command in "${COMMANDS[@]}"; do
|
||||||
echo $command
|
echo "$command"
|
||||||
done
|
done
|
||||||
|
|
||||||
start_network() {
|
start_network() {
|
||||||
@@ -36,7 +36,7 @@ start_nodes() {
|
|||||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||||
GPU_DEVICES+=','
|
GPU_DEVICES+=','
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
@@ -49,17 +49,20 @@ start_nodes() {
|
|||||||
# 3. map the huggingface cache directory to the container
|
# 3. map the huggingface cache directory to the container
|
||||||
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
|
||||||
# starting from 192.168.10.11)
|
# starting from 192.168.10.11)
|
||||||
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
|
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
|
||||||
|
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
|
||||||
|
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
|
||||||
|
/bin/bash -c "tail -f /dev/null"
|
||||||
|
|
||||||
# organize containers into a ray cluster
|
# organize containers into a ray cluster
|
||||||
if [ $node -eq 0 ]; then
|
if [ "$node" -eq 0 ]; then
|
||||||
# start the ray head node
|
# start the ray head node
|
||||||
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
|
docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
|
||||||
# wait for the head node to be ready
|
# wait for the head node to be ready
|
||||||
sleep 10
|
sleep 10
|
||||||
else
|
else
|
||||||
# start the ray worker nodes, and connect them to the head node
|
# start the ray worker nodes, and connect them to the head node
|
||||||
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -79,22 +82,22 @@ run_nodes() {
|
|||||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
|
||||||
GPU_DEVICES+=','
|
GPU_DEVICES+=','
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
GPU_DEVICES+='"'
|
GPU_DEVICES+='"'
|
||||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||||
if [ $node -ne 0 ]; then
|
if [ "$node" -ne 0 ]; then
|
||||||
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||||
else
|
else
|
||||||
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
cleanup() {
|
cleanup() {
|
||||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||||
docker stop node$node
|
docker stop "node$node"
|
||||||
done
|
done
|
||||||
docker network rm docker-net
|
docker network rm docker-net
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the Neuron docker image and run the API server inside the container.
|
# This script build the Neuron docker image and run the API server inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -e
|
set -e
|
||||||
@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
|
|||||||
current_time=$(date +%s)
|
current_time=$(date +%s)
|
||||||
if [ $((current_time - last_build)) -gt 86400 ]; then
|
if [ $((current_time - last_build)) -gt 86400 ]; then
|
||||||
docker system prune -f
|
docker system prune -f
|
||||||
echo $current_time > /tmp/neuron-docker-build-timestamp
|
echo "$current_time" > /tmp/neuron-docker-build-timestamp
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
|
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||||
fi
|
fi
|
||||||
|
|
||||||
docker build -t neuron -f Dockerfile.neuron .
|
docker build -t neuron -f Dockerfile.neuron .
|
||||||
@@ -34,7 +36,7 @@ wait_for_server_to_start() {
|
|||||||
timeout=300
|
timeout=300
|
||||||
counter=0
|
counter=0
|
||||||
|
|
||||||
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
|
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
counter=$((counter + 1))
|
counter=$((counter + 1))
|
||||||
if [ $counter -ge $timeout ]; then
|
if [ $counter -ge $timeout ]; then
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -ex
|
||||||
@@ -11,4 +13,4 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image and launch offline inference
|
# Run the image and launch offline inference
|
||||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
|
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Build the docker image.
|
# Build the docker image.
|
||||||
@@ -12,4 +14,4 @@ remove_docker_container
|
|||||||
# For HF_TOKEN.
|
# For HF_TOKEN.
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
# Run a simple end-to-end example.
|
# Run a simple end-to-end example.
|
||||||
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -ex
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
# label(str): the name of the test. emoji allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
|
# nightly(bool): run this test in nightly pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually)
|
# optional(bool): never run this test by default (i.e. need to unblock manually)
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
# commands(list): the list of commands to run for test. incompatbile with command.
|
# commands(list): the list of commands to run for test. incompatbile with command.
|
||||||
@@ -119,6 +120,7 @@ steps:
|
|||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
@@ -163,6 +165,14 @@ steps:
|
|||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
|
- label: V1 Test
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/v1
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1
|
||||||
|
|
||||||
- label: Examples Test # 15min
|
- label: Examples Test # 15min
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
@@ -229,15 +239,16 @@ steps:
|
|||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
|
# these tests need to be separated, cannot combine
|
||||||
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
|
|
||||||
# TODO: re-write in comparison tests, and fix symbolic shape
|
- label: "PyTorch Fullgraph Test" # 18min
|
||||||
# for quantization ops.
|
source_file_dependencies:
|
||||||
# - label: "PyTorch Fullgraph Test" # 18min
|
- vllm/
|
||||||
# source_file_dependencies:
|
- tests/compile
|
||||||
# - vllm/
|
commands:
|
||||||
# - tests/compile
|
- pytest -v -s compile/test_full_graph.py
|
||||||
# commands:
|
|
||||||
# - pytest -v -s compile/test_full_graph.py
|
|
||||||
|
|
||||||
- label: Kernels Test %N # 1h each
|
- label: Kernels Test %N # 1h each
|
||||||
mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
@@ -266,7 +277,6 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- pip install aiohttp
|
|
||||||
- bash run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
- label: Quantization Test # 33min
|
- label: Quantization Test # 33min
|
||||||
@@ -303,46 +313,70 @@ steps:
|
|||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
- label: Basic Models Test # 3min
|
- label: Basic Models Test # 30min
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models
|
- tests/models
|
||||||
commands:
|
commands:
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
- pytest -v -s models/test_registry.py
|
||||||
|
- pytest -v -s models/test_initialization.py
|
||||||
|
|
||||||
- label: Decoder-only Language Models Test # 1h36min
|
- label: Language Models Test (Standard) # 42min
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/language
|
- tests/models/decoder_only/language
|
||||||
|
- tests/models/embedding/language
|
||||||
|
- tests/models/encoder_decoder/language
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/decoder_only/language
|
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||||
|
- pytest -v -s models/embedding/language -m core_model
|
||||||
|
- pytest -v -s models/embedding/vision_language -m core_model
|
||||||
|
|
||||||
- label: Decoder-only Multi-Modal Models Test # 1h31min
|
- label: Language Models Test (Extended) # 50min
|
||||||
|
nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/decoder_only/language
|
||||||
|
- tests/models/embedding/language
|
||||||
|
- tests/models/encoder_decoder/language
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||||
|
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||||
|
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||||
|
|
||||||
|
- label: Multi-Modal Models Test (Standard) # 26min
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/decoder_only/audio_language
|
- tests/models/decoder_only/audio_language
|
||||||
- tests/models/decoder_only/vision_language
|
- tests/models/decoder_only/vision_language
|
||||||
commands:
|
|
||||||
- pytest -v -s models/decoder_only/audio_language
|
|
||||||
- pytest -v -s models/decoder_only/vision_language
|
|
||||||
|
|
||||||
- label: Other Models Test # 6min
|
|
||||||
#mirror_hardwares: [amd]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/embedding/language
|
|
||||||
- tests/models/embedding/vision_language
|
- tests/models/embedding/vision_language
|
||||||
- tests/models/encoder_decoder/language
|
|
||||||
- tests/models/encoder_decoder/vision_language
|
- tests/models/encoder_decoder/vision_language
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/embedding/language
|
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||||
- pytest -v -s models/embedding/vision_language
|
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||||
- pytest -v -s models/encoder_decoder/language
|
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||||
- pytest -v -s models/encoder_decoder/vision_language
|
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||||
|
|
||||||
|
- label: Multi-Modal Models Test (Extended) # 1h15m
|
||||||
|
nightly: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/decoder_only/audio_language
|
||||||
|
- tests/models/decoder_only/vision_language
|
||||||
|
- tests/models/embedding/vision_language
|
||||||
|
- tests/models/encoder_decoder/vision_language
|
||||||
|
commands:
|
||||||
|
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||||
|
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||||
|
# https://github.com/huggingface/transformers/issues/34307
|
||||||
|
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||||
|
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||||
|
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||||
|
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||||
|
|
||||||
# This test is used only in PR development phase to test individual models and should never run on main
|
# This test is used only in PR development phase to test individual models and should never run on main
|
||||||
- label: Custom Models Test
|
- label: Custom Models Test
|
||||||
@@ -403,12 +437,11 @@ steps:
|
|||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
|
||||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||||
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
- pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
- pytest -v -s distributed/test_distributed_oot.py
|
- pytest -v -s distributed/test_distributed_oot.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
|
||||||
|
|
||||||
- label: Multi-step Tests (4 GPUs) # 36min
|
- label: Multi-step Tests (4 GPUs) # 36min
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
@@ -487,6 +520,7 @@ steps:
|
|||||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||||
|
- torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
|
||||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||||
- pytest -v -s -x lora/test_mixtral.py
|
- pytest -v -s -x lora/test_mixtral.py
|
||||||
|
|
||||||
|
|||||||
38
.buildkite/upload-wheels.sh
Normal file
38
.buildkite/upload-wheels.sh
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Assume wheels are in artifacts/dist/*.whl
|
||||||
|
wheel_files=(artifacts/dist/*.whl)
|
||||||
|
|
||||||
|
# Check that exactly one wheel is found
|
||||||
|
if [[ ${#wheel_files[@]} -ne 1 ]]; then
|
||||||
|
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Get the single wheel file
|
||||||
|
wheel="${wheel_files[0]}"
|
||||||
|
|
||||||
|
# Rename 'linux' to 'manylinux1' in the wheel filename
|
||||||
|
new_wheel="${wheel/linux/manylinux1}"
|
||||||
|
mv -- "$wheel" "$new_wheel"
|
||||||
|
wheel="$new_wheel"
|
||||||
|
|
||||||
|
# Extract the version from the wheel
|
||||||
|
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||||
|
echo "Version: $version"
|
||||||
|
|
||||||
|
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||||
|
if [[ $version == *dev* ]]; then
|
||||||
|
new_version="1.0.0.dev"
|
||||||
|
new_wheel="${wheel/$version/$new_version}"
|
||||||
|
mv -- "$wheel" "$new_wheel"
|
||||||
|
wheel="$new_wheel"
|
||||||
|
version="$new_version"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Upload the wheel to S3
|
||||||
|
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||||
|
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||||
25
.github/dependabot.yml
vendored
25
.github/dependabot.yml
vendored
@@ -5,3 +5,28 @@ updates:
|
|||||||
directory: "/"
|
directory: "/"
|
||||||
schedule:
|
schedule:
|
||||||
interval: "weekly"
|
interval: "weekly"
|
||||||
|
- package-ecosystem: "pip"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "weekly"
|
||||||
|
labels: ["dependencies"]
|
||||||
|
open-pull-requests-limit: 5
|
||||||
|
reviewers: ["khluu", "simon-mo"]
|
||||||
|
allow:
|
||||||
|
- dependency-type: "all"
|
||||||
|
ignore:
|
||||||
|
- dependency-name: "torch"
|
||||||
|
- dependency-name: "torchvision"
|
||||||
|
- dependency-name: "xformers"
|
||||||
|
- dependency-name: "lm-format-enforcer"
|
||||||
|
- dependency-name: "gguf"
|
||||||
|
- dependency-name: "compressed-tensors"
|
||||||
|
- dependency-name: "ray[adag]"
|
||||||
|
- dependency-name: "lm-eval"
|
||||||
|
groups:
|
||||||
|
patch-update:
|
||||||
|
applies-to: version-updates
|
||||||
|
update-types: ["patch"]
|
||||||
|
minor-update:
|
||||||
|
applies-to: version-updates
|
||||||
|
update-types: ["minor"]
|
||||||
|
|||||||
60
.github/mergify.yml
vendored
Normal file
60
.github/mergify.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
pull_request_rules:
|
||||||
|
- name: label-documentation
|
||||||
|
description: Automatically apply documentation label
|
||||||
|
conditions:
|
||||||
|
- or:
|
||||||
|
- files~=^[^/]+\.md$
|
||||||
|
- files~=^docs/
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
add:
|
||||||
|
- documentation
|
||||||
|
|
||||||
|
- name: label-ci-build
|
||||||
|
description: Automatically apply ci/build label
|
||||||
|
conditions:
|
||||||
|
- or:
|
||||||
|
- files~=^\.github/
|
||||||
|
- files~=\.buildkite/
|
||||||
|
- files~=^cmake/
|
||||||
|
- files=CMakeLists.txt
|
||||||
|
- files~=^Dockerfile
|
||||||
|
- files~=^requirements.*\.txt
|
||||||
|
- files=setup.py
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
add:
|
||||||
|
- ci/build
|
||||||
|
|
||||||
|
- name: label-frontend
|
||||||
|
description: Automatically apply frontend label
|
||||||
|
conditions:
|
||||||
|
- files~=^vllm/entrypoints/
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
add:
|
||||||
|
- frontend
|
||||||
|
|
||||||
|
- name: ping author on conflicts and add 'needs-rebase' label
|
||||||
|
conditions:
|
||||||
|
- conflict
|
||||||
|
- -closed
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
add:
|
||||||
|
- needs-rebase
|
||||||
|
comment:
|
||||||
|
message: |
|
||||||
|
This pull request has merge conflicts that must be resolved before it can be
|
||||||
|
merged. Please rebase the PR, @{{author}}.
|
||||||
|
|
||||||
|
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||||
|
|
||||||
|
- name: remove 'needs-rebase' label when conflict is resolved
|
||||||
|
conditions:
|
||||||
|
- -conflict
|
||||||
|
- -closed
|
||||||
|
actions:
|
||||||
|
label:
|
||||||
|
remove:
|
||||||
|
- needs-rebase
|
||||||
33
.github/scripts/cleanup_pr_body.sh
vendored
Executable file
33
.github/scripts/cleanup_pr_body.sh
vendored
Executable file
@@ -0,0 +1,33 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
# ensure 1 argument is passed
|
||||||
|
if [ "$#" -ne 1 ]; then
|
||||||
|
echo "Usage: $0 <pr_number>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
PR_NUMBER=$1
|
||||||
|
OLD=/tmp/orig_pr_body.txt
|
||||||
|
NEW=/tmp/new_pr_body.txt
|
||||||
|
|
||||||
|
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||||
|
cp "${OLD}" "${NEW}"
|
||||||
|
|
||||||
|
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||||
|
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
|
||||||
|
|
||||||
|
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||||
|
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||||
|
|
||||||
|
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||||
|
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||||
|
|
||||||
|
# Run this only if ${NEW} is different than ${OLD}
|
||||||
|
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||||
|
echo "Updating PR body"
|
||||||
|
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||||
|
else
|
||||||
|
echo "No changes needed"
|
||||||
|
fi
|
||||||
5
.github/workflows/actionlint.yml
vendored
5
.github/workflows/actionlint.yml
vendored
@@ -6,12 +6,14 @@ on:
|
|||||||
paths:
|
paths:
|
||||||
- '.github/workflows/*.ya?ml'
|
- '.github/workflows/*.ya?ml'
|
||||||
- '.github/workflows/actionlint.*'
|
- '.github/workflows/actionlint.*'
|
||||||
|
- '.github/workflows/matchers/actionlint.json'
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
- "main"
|
- "main"
|
||||||
paths:
|
paths:
|
||||||
- '.github/workflows/*.ya?ml'
|
- '.github/workflows/*.ya?ml'
|
||||||
- '.github/workflows/actionlint.*'
|
- '.github/workflows/actionlint.*'
|
||||||
|
- '.github/workflows/matchers/actionlint.json'
|
||||||
|
|
||||||
env:
|
env:
|
||||||
LC_ALL: en_US.UTF-8
|
LC_ALL: en_US.UTF-8
|
||||||
@@ -28,10 +30,11 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: "Checkout"
|
- name: "Checkout"
|
||||||
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: "Run actionlint"
|
- name: "Run actionlint"
|
||||||
run: |
|
run: |
|
||||||
|
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||||
tools/actionlint.sh -color
|
tools/actionlint.sh -color
|
||||||
|
|||||||
2
.github/workflows/add_label_automerge.yml
vendored
2
.github/workflows/add_label_automerge.yml
vendored
@@ -8,7 +8,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Add label
|
- name: Add label
|
||||||
uses: actions/github-script@v7
|
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||||
with:
|
with:
|
||||||
script: |
|
script: |
|
||||||
github.rest.issues.addLabels({
|
github.rest.issues.addLabels({
|
||||||
|
|||||||
16
.github/workflows/clang-format.yml
vendored
16
.github/workflows/clang-format.yml
vendored
@@ -6,9 +6,21 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- '**/*.h'
|
||||||
|
- '**/*.cpp'
|
||||||
|
- '**/*.cu'
|
||||||
|
- '**/*.cuh'
|
||||||
|
- '.github/workflows/clang-format.yml'
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- '**/*.h'
|
||||||
|
- '**/*.cpp'
|
||||||
|
- '**/*.cu'
|
||||||
|
- '**/*.cuh'
|
||||||
|
- '.github/workflows/clang-format.yml'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
clang-format:
|
clang-format:
|
||||||
@@ -17,9 +29,9 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.11"]
|
python-version: ["3.11"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
|
|||||||
26
.github/workflows/cleanup_pr_body.yml
vendored
Normal file
26
.github/workflows/cleanup_pr_body.yml
vendored
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
name: Cleanup PR Body
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request_target:
|
||||||
|
types: [opened, reopened, edited]
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
pull-requests: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
update-description:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
|
||||||
|
- name: Update PR description
|
||||||
|
env:
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||||
45
.github/workflows/codespell.yml
vendored
Normal file
45
.github/workflows/codespell.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
name: codespell
|
||||||
|
|
||||||
|
on:
|
||||||
|
# Trigger the workflow on push or pull request,
|
||||||
|
# but only for the main branch
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
paths:
|
||||||
|
- "**/*.py"
|
||||||
|
- "**/*.md"
|
||||||
|
- "**/*.rst"
|
||||||
|
- pyproject.toml
|
||||||
|
- requirements-lint.txt
|
||||||
|
- .github/workflows/codespell.yml
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
paths:
|
||||||
|
- "**/*.py"
|
||||||
|
- "**/*.md"
|
||||||
|
- "**/*.rst"
|
||||||
|
- pyproject.toml
|
||||||
|
- requirements-lint.txt
|
||||||
|
- .github/workflows/codespell.yml
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
codespell:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
python-version: ["3.12"]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r requirements-lint.txt
|
||||||
|
- name: Spelling check with codespell
|
||||||
|
run: |
|
||||||
|
codespell --toml pyproject.toml
|
||||||
16
.github/workflows/matchers/mypy.json
vendored
Normal file
16
.github/workflows/matchers/mypy.json
vendored
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"problemMatcher": [
|
||||||
|
{
|
||||||
|
"owner": "mypy",
|
||||||
|
"pattern": [
|
||||||
|
{
|
||||||
|
"regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$",
|
||||||
|
"file": 1,
|
||||||
|
"line": 2,
|
||||||
|
"severity": 3,
|
||||||
|
"message": 4
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
17
.github/workflows/matchers/ruff.json
vendored
Normal file
17
.github/workflows/matchers/ruff.json
vendored
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"problemMatcher": [
|
||||||
|
{
|
||||||
|
"owner": "ruff",
|
||||||
|
"pattern": [
|
||||||
|
{
|
||||||
|
"regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
|
||||||
|
"file": 1,
|
||||||
|
"line": 2,
|
||||||
|
"column": 3,
|
||||||
|
"code": 4,
|
||||||
|
"message": 5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
24
.github/workflows/mypy.yaml
vendored
24
.github/workflows/mypy.yaml
vendored
@@ -6,20 +6,35 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- '**/*.py'
|
||||||
|
- '.github/workflows/mypy.yaml'
|
||||||
|
- 'tools/mypy.sh'
|
||||||
|
- 'pyproject.toml'
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
# This workflow is only relevant when one of the following files changes.
|
||||||
|
# However, we have github configured to expect and require this workflow
|
||||||
|
# to run and pass before github with auto-merge a pull request. Until github
|
||||||
|
# allows more flexible auto-merge policy, we can just run this on every PR.
|
||||||
|
# It doesn't take that long to run, anyway.
|
||||||
|
#paths:
|
||||||
|
# - '**/*.py'
|
||||||
|
# - '.github/workflows/mypy.yaml'
|
||||||
|
# - 'tools/mypy.sh'
|
||||||
|
# - 'pyproject.toml'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
mypy:
|
mypy:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
@@ -32,4 +47,5 @@ jobs:
|
|||||||
pip install types-setuptools
|
pip install types-setuptools
|
||||||
- name: Mypy
|
- name: Mypy
|
||||||
run: |
|
run: |
|
||||||
tools/mypy.sh
|
echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||||
|
tools/mypy.sh 1 ${{ matrix.python-version }}
|
||||||
|
|||||||
14
.github/workflows/publish.yml
vendored
14
.github/workflows/publish.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
|||||||
upload_url: ${{ steps.create_release.outputs.upload_url }}
|
upload_url: ${{ steps.create_release.outputs.upload_url }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
|
||||||
- name: Extract branch info
|
- name: Extract branch info
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -30,7 +30,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Create Release
|
- name: Create Release
|
||||||
id: create_release
|
id: create_release
|
||||||
uses: "actions/github-script@v7"
|
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||||
env:
|
env:
|
||||||
RELEASE_TAG: ${{ env.release_tag }}
|
RELEASE_TAG: ${{ env.release_tag }}
|
||||||
with:
|
with:
|
||||||
@@ -48,16 +48,16 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: ['ubuntu-20.04']
|
os: ['ubuntu-20.04']
|
||||||
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
|
python-version: ['3.9', '3.10', '3.11', '3.12']
|
||||||
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||||
cuda-version: ['11.8', '12.1']
|
cuda-version: ['11.8', '12.1']
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
|
||||||
- name: Setup ccache
|
- name: Setup ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2
|
uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
|
||||||
with:
|
with:
|
||||||
create-symlink: true
|
create-symlink: true
|
||||||
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
|
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
|
||||||
@@ -68,7 +68,7 @@ jobs:
|
|||||||
bash -x .github/workflows/scripts/env.sh
|
bash -x .github/workflows/scripts/env.sh
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
@@ -92,7 +92,7 @@ jobs:
|
|||||||
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
|
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Upload Release Asset
|
- name: Upload Release Asset
|
||||||
uses: actions/upload-release-asset@v1
|
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
with:
|
with:
|
||||||
|
|||||||
2
.github/workflows/reminder_comment.yml
vendored
2
.github/workflows/reminder_comment.yml
vendored
@@ -8,7 +8,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Remind to run full CI on PR
|
- name: Remind to run full CI on PR
|
||||||
uses: actions/github-script@v7
|
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
||||||
with:
|
with:
|
||||||
script: |
|
script: |
|
||||||
github.rest.issues.createComment({
|
github.rest.issues.createComment({
|
||||||
|
|||||||
53
.github/workflows/ruff.yml
vendored
53
.github/workflows/ruff.yml
vendored
@@ -6,32 +6,47 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- "**/*.py"
|
||||||
|
- pyproject.toml
|
||||||
|
- requirements-lint.txt
|
||||||
|
- .github/workflows/matchers/ruff.json
|
||||||
|
- .github/workflows/ruff.yml
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
# This workflow is only relevant when one of the following files changes.
|
||||||
|
# However, we have github configured to expect and require this workflow
|
||||||
|
# to run and pass before github with auto-merge a pull request. Until github
|
||||||
|
# allows more flexible auto-merge policy, we can just run this on every PR.
|
||||||
|
# It doesn't take that long to run, anyway.
|
||||||
|
#paths:
|
||||||
|
# - "**/*.py"
|
||||||
|
# - pyproject.toml
|
||||||
|
# - requirements-lint.txt
|
||||||
|
# - .github/workflows/matchers/ruff.json
|
||||||
|
# - .github/workflows/ruff.yml
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
ruff:
|
ruff:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.12"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install -r requirements-lint.txt
|
pip install -r requirements-lint.txt
|
||||||
- name: Analysing the code with ruff
|
- name: Analysing the code with ruff
|
||||||
run: |
|
run: |
|
||||||
ruff check .
|
echo "::add-matcher::.github/workflows/matchers/ruff.json"
|
||||||
- name: Spelling check with codespell
|
ruff check --output-format github .
|
||||||
run: |
|
- name: Run isort
|
||||||
codespell --toml pyproject.toml
|
run: |
|
||||||
- name: Run isort
|
isort . --check-only
|
||||||
run: |
|
|
||||||
isort . --check-only
|
|
||||||
|
|||||||
8
.github/workflows/scripts/cuda-install.sh
vendored
8
.github/workflows/scripts/cuda-install.sh
vendored
@@ -1,16 +1,16 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# Replace '.' with '-' ex: 11.8 -> 11-8
|
# Replace '.' with '-' ex: 11.8 -> 11-8
|
||||||
cuda_version=$(echo $1 | tr "." "-")
|
cuda_version=$(echo "$1" | tr "." "-")
|
||||||
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
|
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
|
||||||
OS=$(echo $2 | tr -d ".\-")
|
OS=$(echo "$2" | tr -d ".\-")
|
||||||
|
|
||||||
# Installs CUDA
|
# Installs CUDA
|
||||||
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
|
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
|
||||||
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
||||||
rm cuda-keyring_1.1-1_all.deb
|
rm cuda-keyring_1.1-1_all.deb
|
||||||
sudo apt -qq update
|
sudo apt -qq update
|
||||||
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
|
sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
|
||||||
sudo apt clean
|
sudo apt clean
|
||||||
|
|
||||||
# Test nvcc
|
# Test nvcc
|
||||||
|
|||||||
2
.github/workflows/scripts/pytorch-install.sh
vendored
2
.github/workflows/scripts/pytorch-install.sh
vendored
@@ -6,7 +6,7 @@ cuda_version=$3
|
|||||||
|
|
||||||
# Install torch
|
# Install torch
|
||||||
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
|
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
|
||||||
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
|
$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
|
||||||
|
|
||||||
# Print version information
|
# Print version information
|
||||||
$python_executable --version
|
$python_executable --version
|
||||||
|
|||||||
37
.github/workflows/shellcheck.yml
vendored
Normal file
37
.github/workflows/shellcheck.yml
vendored
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
name: Lint shell scripts
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- "main"
|
||||||
|
paths:
|
||||||
|
- '**/*.sh'
|
||||||
|
- '.github/workflows/shellcheck.yml'
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- "main"
|
||||||
|
paths:
|
||||||
|
- '**/*.sh'
|
||||||
|
- '.github/workflows/shellcheck.yml'
|
||||||
|
|
||||||
|
env:
|
||||||
|
LC_ALL: en_US.UTF-8
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
shellcheck:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: "Checkout"
|
||||||
|
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: "Check shell scripts"
|
||||||
|
run: |
|
||||||
|
tools/shellcheck.sh
|
||||||
52
.github/workflows/stale.yml
vendored
Normal file
52
.github/workflows/stale.yml
vendored
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
name: 'Close inactive issues and PRs'
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
# Daily at 1:30 AM UTC
|
||||||
|
- cron: '30 1 * * *'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
close-issues-and-pull-requests:
|
||||||
|
permissions:
|
||||||
|
issues: write
|
||||||
|
pull-requests: write
|
||||||
|
actions: write
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
|
||||||
|
with:
|
||||||
|
# Increasing this value ensures that changes to this workflow
|
||||||
|
# propagate to all issues and PRs in days rather than months
|
||||||
|
operations-per-run: 1000
|
||||||
|
|
||||||
|
exempt-draft-pr: true
|
||||||
|
exempt-issue-labels: 'keep-open'
|
||||||
|
exempt-pr-labels: 'keep-open'
|
||||||
|
|
||||||
|
labels-to-add-when-unstale: 'unstale'
|
||||||
|
labels-to-remove-when-stale: 'unstale'
|
||||||
|
|
||||||
|
days-before-issue-stale: 90
|
||||||
|
days-before-issue-close: 30
|
||||||
|
stale-issue-label: 'stale'
|
||||||
|
stale-issue-message: >
|
||||||
|
This issue has been automatically marked as stale because it has not
|
||||||
|
had any activity within 90 days. It will be automatically closed if no
|
||||||
|
further activity occurs within 30 days. Leave a comment if
|
||||||
|
you feel this issue should remain open. Thank you!
|
||||||
|
close-issue-message: >
|
||||||
|
This issue has been automatically closed due to inactivity. Please
|
||||||
|
feel free to reopen if you feel it is still relevant. Thank you!
|
||||||
|
|
||||||
|
days-before-pr-stale: 90
|
||||||
|
days-before-pr-close: 30
|
||||||
|
stale-pr-label: 'stale'
|
||||||
|
stale-pr-message: >
|
||||||
|
This pull request has been automatically marked as stale because it
|
||||||
|
has not had any activity within 90 days. It will be automatically
|
||||||
|
closed if no further activity occurs within 30 days. Leave a comment
|
||||||
|
if you feel this pull request should remain open. Thank you!
|
||||||
|
close-pr-message: >
|
||||||
|
This pull request has been automatically closed due to inactivity.
|
||||||
|
Please feel free to reopen if you intend to continue working on it.
|
||||||
|
Thank you!
|
||||||
35
.github/workflows/yapf.yml
vendored
35
.github/workflows/yapf.yml
vendored
@@ -6,26 +6,33 @@ on:
|
|||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- "**/*.py"
|
||||||
|
- .github/workflows/yapf.yml
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
branches:
|
||||||
- main
|
- main
|
||||||
|
paths:
|
||||||
|
- "**/*.py"
|
||||||
|
- .github/workflows/yapf.yml
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
yapf:
|
yapf:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
|
python-version: ["3.12"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install yapf==0.32.0
|
pip install yapf==0.32.0
|
||||||
pip install toml==0.10.2
|
pip install toml==0.10.2
|
||||||
- name: Running yapf
|
- name: Running yapf
|
||||||
run: |
|
run: |
|
||||||
yapf --diff --recursive .
|
yapf --diff --recursive .
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -202,3 +202,4 @@ benchmarks/*.json
|
|||||||
|
|
||||||
# Linting
|
# Linting
|
||||||
actionlint
|
actionlint
|
||||||
|
shellcheck*/
|
||||||
|
|||||||
@@ -6,17 +6,16 @@ version: 2
|
|||||||
build:
|
build:
|
||||||
os: ubuntu-22.04
|
os: ubuntu-22.04
|
||||||
tools:
|
tools:
|
||||||
python: "3.8"
|
python: "3.12"
|
||||||
|
|
||||||
sphinx:
|
sphinx:
|
||||||
configuration: docs/source/conf.py
|
configuration: docs/source/conf.py
|
||||||
fail_on_warning: true
|
fail_on_warning: true
|
||||||
|
|
||||||
# If using Sphinx, optionally build your docs in additional formats such as PDF
|
# If using Sphinx, optionally build your docs in additional formats such as PDF
|
||||||
formats: []
|
formats: []
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
# Optionally declare the Python requirements required to build your docs
|
||||||
python:
|
python:
|
||||||
install:
|
install:
|
||||||
- requirements: docs/requirements-docs.txt
|
- requirements: docs/requirements-docs.txt
|
||||||
|
|
||||||
|
|||||||
9
.shellcheckrc
Normal file
9
.shellcheckrc
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# rules currently disabled:
|
||||||
|
#
|
||||||
|
# SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
|
||||||
|
# SC2004 (style): $/${} is unnecessary on arithmetic variables.
|
||||||
|
# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
|
||||||
|
# SC2155 (warning): Declare and assign separately to avoid masking return values.
|
||||||
|
# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
|
||||||
|
#
|
||||||
|
disable=SC1091,SC2004,SC2129,SC2155,SC2164
|
||||||
@@ -31,13 +31,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
|||||||
# Supported python versions. These versions will be searched in order, the
|
# Supported python versions. These versions will be searched in order, the
|
||||||
# first match will be selected. These should be kept in sync with setup.py.
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
#
|
#
|
||||||
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
|
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||||
|
|
||||||
# Supported NVIDIA architectures.
|
# Supported NVIDIA architectures.
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported/expected torch versions for CUDA/ROCm.
|
# Supported/expected torch versions for CUDA/ROCm.
|
||||||
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from Dockerfile.rocm
|
# versions are derived from Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@@ -83,24 +83,6 @@ endif()
|
|||||||
#
|
#
|
||||||
find_package(Torch REQUIRED)
|
find_package(Torch REQUIRED)
|
||||||
|
|
||||||
#
|
|
||||||
message(STATUS "Enabling core extension.")
|
|
||||||
|
|
||||||
# Define _core_C extension
|
|
||||||
# built for (almost) every target platform, (excludes TPU and Neuron)
|
|
||||||
|
|
||||||
set(VLLM_EXT_SRC
|
|
||||||
"csrc/core/torch_bindings.cpp")
|
|
||||||
|
|
||||||
define_gpu_extension_target(
|
|
||||||
_core_C
|
|
||||||
DESTINATION vllm
|
|
||||||
LANGUAGE CXX
|
|
||||||
SOURCES ${VLLM_EXT_SRC}
|
|
||||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
|
||||||
USE_SABI 3
|
|
||||||
WITH_SOABI)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||||
#
|
#
|
||||||
@@ -187,12 +169,12 @@ endif()
|
|||||||
|
|
||||||
#
|
#
|
||||||
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
|
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
|
||||||
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
|
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
|
||||||
|
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
|
||||||
|
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
|
||||||
#
|
#
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
|
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||||
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
|
|
||||||
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
|
|
||||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -205,15 +187,16 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
|||||||
|
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
"csrc/cache_kernels.cu"
|
"csrc/cache_kernels.cu"
|
||||||
"csrc/attention/attention_kernels.cu"
|
"csrc/attention/paged_attention_v1.cu"
|
||||||
|
"csrc/attention/paged_attention_v2.cu"
|
||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/cuda_utils_kernels.cu"
|
"csrc/cuda_utils_kernels.cu"
|
||||||
"csrc/moe_align_block_size_kernels.cu"
|
|
||||||
"csrc/prepare_inputs/advance_step.cu"
|
"csrc/prepare_inputs/advance_step.cu"
|
||||||
"csrc/torch_bindings.cpp")
|
"csrc/torch_bindings.cpp")
|
||||||
|
|
||||||
@@ -270,7 +253,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
|
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
message(STATUS "Not building Marlin kernels as no compatible archs found"
|
||||||
"in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -423,6 +406,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
|||||||
|
|
||||||
set(VLLM_MOE_EXT_SRC
|
set(VLLM_MOE_EXT_SRC
|
||||||
"csrc/moe/torch_bindings.cpp"
|
"csrc/moe/torch_bindings.cpp"
|
||||||
|
"csrc/moe/moe_align_sum_kernels.cu"
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
@@ -450,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
"in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@@ -525,8 +509,10 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
|
GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
# Don't share the vllm-flash-attn build between build types
|
||||||
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@@ -1,50 +1,3 @@
|
|||||||
# Contributing to vLLM
|
# Contributing to vLLM
|
||||||
|
|
||||||
Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
|
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
|
||||||
|
|
||||||
- Identify and report any issues or bugs.
|
|
||||||
- Request or add support for a new model.
|
|
||||||
- Suggest or implement new features.
|
|
||||||
- Improve documentation or contribute a how-to guide.
|
|
||||||
|
|
||||||
We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
|
|
||||||
|
|
||||||
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
|
|
||||||
|
|
||||||
|
|
||||||
## Developing
|
|
||||||
|
|
||||||
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
|
|
||||||
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install -r requirements-dev.txt
|
|
||||||
|
|
||||||
# linting and formatting
|
|
||||||
bash format.sh
|
|
||||||
# Static type checking
|
|
||||||
mypy
|
|
||||||
# Unit tests
|
|
||||||
pytest tests/
|
|
||||||
```
|
|
||||||
**Note:** Currently, the repository does not pass the ``mypy`` tests.
|
|
||||||
|
|
||||||
## Contribution Guidelines
|
|
||||||
|
|
||||||
### Issues
|
|
||||||
|
|
||||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
|
|
||||||
|
|
||||||
### Pull Requests & Code Reviews
|
|
||||||
|
|
||||||
Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
|
|
||||||
|
|
||||||
### Thank You
|
|
||||||
|
|
||||||
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
|
|
||||||
All of your contributions help make vLLM a great tool and community for everyone!
|
|
||||||
|
|||||||
34
DCO
Normal file
34
DCO
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
Developer Certificate of Origin
|
||||||
|
Version 1.1
|
||||||
|
|
||||||
|
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
|
||||||
|
|
||||||
|
Everyone is permitted to copy and distribute verbatim copies of this
|
||||||
|
license document, but changing it is not allowed.
|
||||||
|
|
||||||
|
|
||||||
|
Developer's Certificate of Origin 1.1
|
||||||
|
|
||||||
|
By making a contribution to this project, I certify that:
|
||||||
|
|
||||||
|
(a) The contribution was created in whole or in part by me and I
|
||||||
|
have the right to submit it under the open source license
|
||||||
|
indicated in the file; or
|
||||||
|
|
||||||
|
(b) The contribution is based upon previous work that, to the best
|
||||||
|
of my knowledge, is covered under an appropriate open source
|
||||||
|
license and I have the right under that license to submit that
|
||||||
|
work with modifications, whether created in whole or in part
|
||||||
|
by me, under the same open source license (unless I am
|
||||||
|
permitted to submit under a different license), as indicated
|
||||||
|
in the file; or
|
||||||
|
|
||||||
|
(c) The contribution was provided directly to me by some other
|
||||||
|
person who certified (a), (b) or (c) and I have not modified
|
||||||
|
it.
|
||||||
|
|
||||||
|
(d) I understand and agree that this project and the contribution
|
||||||
|
are public and that a record of the contribution (including all
|
||||||
|
personal information I submit with it, including my sign-off) is
|
||||||
|
maintained indefinitely and may be redistributed consistent with
|
||||||
|
this project or the open source license(s) involved.
|
||||||
10
Dockerfile
10
Dockerfile
@@ -191,6 +191,14 @@ ADD . /vllm-workspace/
|
|||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install -r requirements-dev.txt
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
# enable fast downloads from hf (for testing)
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
|
python3 -m pip install hf_transfer
|
||||||
|
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||||
|
|
||||||
|
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||||
|
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
|
||||||
|
|
||||||
# doc requires source code
|
# doc requires source code
|
||||||
# we hide them inside `test_docs/` , so that this source code
|
# we hide them inside `test_docs/` , so that this source code
|
||||||
# will not be imported by other tests
|
# will not be imported by other tests
|
||||||
@@ -206,7 +214,7 @@ FROM vllm-base AS vllm-openai
|
|||||||
|
|
||||||
# install additional dependencies for openai api server
|
# install additional dependencies for openai api server
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
|
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
|
|||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
RUN pip install intel_extension_for_pytorch==2.4.0
|
RUN pip install intel_extension_for_pytorch==2.5.0
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
|||||||
18
Dockerfile.hpu
Normal file
18
Dockerfile.hpu
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
|
||||||
|
|
||||||
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
RUN pip install -v -r requirements-hpu.txt
|
||||||
|
|
||||||
|
ENV no_proxy=localhost,127.0.0.1
|
||||||
|
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||||
|
|
||||||
|
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
|
||||||
|
|
||||||
|
WORKDIR /workspace/
|
||||||
|
|
||||||
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
@@ -31,11 +31,11 @@ RUN --mount=type=bind,source=.git,target=.git \
|
|||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
RUN python3 -m pip install -U \
|
RUN python3 -m pip install -U \
|
||||||
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
|
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||||
-r requirements-neuron.txt
|
-r requirements-neuron.txt
|
||||||
|
|
||||||
ENV VLLM_TARGET_DEVICE neuron
|
ENV VLLM_TARGET_DEVICE neuron
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
pip install --no-build-isolation -v -e . \
|
pip install --no-build-isolation -v -e .
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
@@ -15,11 +15,11 @@ RUN --mount=type=bind,source=.git,target=.git \
|
|||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
# install build requirements
|
# install build requirements
|
||||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
|
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
|
||||||
# build vLLM with OpenVINO backend
|
# build vLLM with OpenVINO backend
|
||||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
|
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
||||||
|
|
||||||
COPY examples/ /workspace/vllm/examples
|
COPY examples/ /workspace/examples
|
||||||
COPY benchmarks/ /workspace/vllm/benchmarks
|
COPY benchmarks/ /workspace/benchmarks
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
|
|||||||
# These packages will be in rocketce eventually
|
# These packages will be in rocketce eventually
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
||||||
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
|
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||||
torch==2.3.1 \
|
torch==2.3.1 \
|
||||||
-r requirements-cpu.txt \
|
-r requirements-cpu.txt \
|
||||||
xformers uvloop==0.20.0
|
xformers uvloop==0.20.0
|
||||||
@@ -33,4 +33,4 @@ WORKDIR /workspace/
|
|||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
|||||||
python3 -m pip uninstall -y torch torchvision \
|
python3 -m pip uninstall -y torch torchvision \
|
||||||
&& python3 -m pip install --pre \
|
&& python3 -m pip install --pre \
|
||||||
torch==2.6.0.dev20240918 \
|
torch==2.6.0.dev20240918 \
|
||||||
setuptools-scm>=8 \
|
'setuptools-scm>=8' \
|
||||||
torchvision==0.20.0.dev20240918 \
|
torchvision==0.20.0.dev20240918 \
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
|
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
|
||||||
*) ;; esac
|
*) ;; esac
|
||||||
@@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0
|
|||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade pip
|
||||||
|
|
||||||
# Package upgrades for useful functionality or to avoid dependency issues
|
# Package upgrades for useful functionality or to avoid dependency issues
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
ARG NIGHTLY_DATE="20240828"
|
ARG NIGHTLY_DATE="20241017"
|
||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
@@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \
|
|||||||
git \
|
git \
|
||||||
ffmpeg libsm6 libxext6 libgl1
|
ffmpeg libsm6 libxext6 libgl1
|
||||||
|
|
||||||
# Install the TPU and Pallas dependencies.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
|
||||||
|
|
||||||
# Build vLLM.
|
# Build vLLM.
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK=0
|
ARG GIT_REPO_CHECK=0
|
||||||
@@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu"
|
|||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
python3 -m pip install \
|
python3 -m pip install \
|
||||||
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
|
|
||||||
-r requirements-tpu.txt
|
-r requirements-tpu.txt
|
||||||
RUN python3 setup.py develop
|
RUN python3 setup.py develop
|
||||||
|
|
||||||
|
|||||||
@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt
|
|||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
pip install --no-cache-dir \
|
pip install --no-cache-dir \
|
||||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
|
|
||||||
-r requirements-xpu.txt
|
-r requirements-xpu.txt
|
||||||
|
|
||||||
|
RUN git clone https://github.com/intel/pti-gpu && \
|
||||||
|
cd pti-gpu/sdk && \
|
||||||
|
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
|
||||||
|
mkdir build && \
|
||||||
|
cd build && \
|
||||||
|
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
|
||||||
|
make -j && \
|
||||||
|
cmake --install . --config Release --prefix "/usr/local"
|
||||||
|
|
||||||
|
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK
|
ARG GIT_REPO_CHECK
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
|
|||||||
@@ -13,8 +13,10 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing).
|
||||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
|
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
|
||||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||||
|
|||||||
@@ -6,3 +6,14 @@ You can download the dataset by running:
|
|||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Downloading the ShareGPT4V dataset
|
||||||
|
|
||||||
|
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
|
||||||
|
will ignore a datapoint if the referred image is missing.
|
||||||
|
```bash
|
||||||
|
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
|
||||||
|
mkdir coco -p
|
||||||
|
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
|
||||||
|
unzip coco/train2017.zip -d coco/
|
||||||
|
```
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ async def async_request_tgi(
|
|||||||
# any data, we should skip it.
|
# any data, we should skip it.
|
||||||
if chunk_bytes.startswith(":"):
|
if chunk_bytes.startswith(":"):
|
||||||
continue
|
continue
|
||||||
chunk = remove_prefix(chunk_bytes, "data:")
|
chunk = chunk_bytes.removeprefix("data:")
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data:")
|
"data:")
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
output.generated_text += data["text_output"]
|
output.generated_text += data["text_output"]
|
||||||
@@ -256,13 +256,14 @@ async def async_request_openai_completions(
|
|||||||
async with session.post(url=api_url, json=payload,
|
async with session.post(url=api_url, json=payload,
|
||||||
headers=headers) as response:
|
headers=headers) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
|
first_chunk_received = False
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk == "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
latency = time.perf_counter() - st
|
||||||
else:
|
else:
|
||||||
@@ -274,7 +275,8 @@ async def async_request_openai_completions(
|
|||||||
if data["choices"][0]["text"]:
|
if data["choices"][0]["text"]:
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if not first_chunk_received:
|
||||||
|
first_chunk_received = True
|
||||||
ttft = time.perf_counter() - st
|
ttft = time.perf_counter() - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
@@ -285,9 +287,14 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += data["choices"][0]["text"]
|
generated_text += data["choices"][0]["text"]
|
||||||
|
if first_chunk_received:
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.success = False
|
||||||
|
output.error = (
|
||||||
|
"Never received a valid chunk to calculate TTFT."
|
||||||
|
"This response will be marked as failed!")
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.success = True
|
|
||||||
output.latency = latency
|
output.latency = latency
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
@@ -324,7 +331,7 @@ async def async_request_openai_chat_completions(
|
|||||||
},
|
},
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"ignore_eos": request_func_input.ignore_eos,
|
"ignore_eos": request_func_input.ignore_eos,
|
||||||
}
|
}
|
||||||
@@ -349,8 +356,8 @@ async def async_request_openai_chat_completions(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk == "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
latency = time.perf_counter() - st
|
||||||
else:
|
else:
|
||||||
@@ -389,14 +396,6 @@ async def async_request_openai_chat_completions(
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
|
|
||||||
# introduced in Python 3.9
|
|
||||||
def remove_prefix(text: str, prefix: str) -> str:
|
|
||||||
if text.startswith(prefix):
|
|
||||||
return text[len(prefix):]
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Benchmark the latency of processing a single batch of requests."""
|
"""Benchmark the latency of processing a single batch of requests."""
|
||||||
import argparse
|
import argparse
|
||||||
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -10,43 +11,19 @@ import torch
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptType
|
from vllm.inputs import PromptType
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
|
|
||||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||||
# the engine will automatically process the request in multiple batches.
|
# the engine will automatically process the request in multiple batches.
|
||||||
llm = LLM(
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
model=args.model,
|
|
||||||
speculative_model=args.speculative_model,
|
|
||||||
num_speculative_tokens=args.num_speculative_tokens,
|
|
||||||
speculative_draft_tensor_parallel_size=\
|
|
||||||
args.speculative_draft_tensor_parallel_size,
|
|
||||||
tokenizer=args.tokenizer,
|
|
||||||
quantization=args.quantization,
|
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
|
||||||
trust_remote_code=args.trust_remote_code,
|
|
||||||
dtype=args.dtype,
|
|
||||||
max_model_len=args.max_model_len,
|
|
||||||
enforce_eager=args.enforce_eager,
|
|
||||||
kv_cache_dtype=args.kv_cache_dtype,
|
|
||||||
quantization_param_path=args.quantization_param_path,
|
|
||||||
device=args.device,
|
|
||||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
|
||||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
|
||||||
download_dir=args.download_dir,
|
|
||||||
block_size=args.block_size,
|
|
||||||
gpu_memory_utilization=args.gpu_memory_utilization,
|
|
||||||
load_format=args.load_format,
|
|
||||||
distributed_executor_backend=args.distributed_executor_backend,
|
|
||||||
otlp_traces_endpoint=args.otlp_traces_endpoint,
|
|
||||||
enable_prefix_caching=args.enable_prefix_caching,
|
|
||||||
)
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
n=args.n,
|
n=args.n,
|
||||||
@@ -125,19 +102,6 @@ if __name__ == '__main__':
|
|||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Benchmark the latency of processing a single batch of '
|
description='Benchmark the latency of processing a single batch of '
|
||||||
'requests till completion.')
|
'requests till completion.')
|
||||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
|
||||||
parser.add_argument('--speculative-model', type=str, default=None)
|
|
||||||
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
|
||||||
parser.add_argument('--speculative-draft-tensor-parallel-size',
|
|
||||||
'-spec-draft-tp',
|
|
||||||
type=int,
|
|
||||||
default=None)
|
|
||||||
parser.add_argument('--tokenizer', type=str, default=None)
|
|
||||||
parser.add_argument('--quantization',
|
|
||||||
'-q',
|
|
||||||
choices=[*QUANTIZATION_METHODS, None],
|
|
||||||
default=None)
|
|
||||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
|
||||||
parser.add_argument('--input-len', type=int, default=32)
|
parser.add_argument('--input-len', type=int, default=32)
|
||||||
parser.add_argument('--output-len', type=int, default=128)
|
parser.add_argument('--output-len', type=int, default=128)
|
||||||
parser.add_argument('--batch-size', type=int, default=8)
|
parser.add_argument('--batch-size', type=int, default=8)
|
||||||
@@ -154,45 +118,6 @@ if __name__ == '__main__':
|
|||||||
type=int,
|
type=int,
|
||||||
default=30,
|
default=30,
|
||||||
help='Number of iterations to run.')
|
help='Number of iterations to run.')
|
||||||
parser.add_argument('--trust-remote-code',
|
|
||||||
action='store_true',
|
|
||||||
help='trust remote code from huggingface')
|
|
||||||
parser.add_argument(
|
|
||||||
'--max-model-len',
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help='Maximum length of a sequence (including prompt and output). '
|
|
||||||
'If None, will be derived from the model.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--dtype',
|
|
||||||
type=str,
|
|
||||||
default='auto',
|
|
||||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
|
||||||
help='data type for model weights and activations. '
|
|
||||||
'The "auto" option will use FP16 precision '
|
|
||||||
'for FP32 and FP16 models, and BF16 precision '
|
|
||||||
'for BF16 models.')
|
|
||||||
parser.add_argument('--enforce-eager',
|
|
||||||
action='store_true',
|
|
||||||
help='enforce eager mode and disable CUDA graph')
|
|
||||||
parser.add_argument(
|
|
||||||
'--kv-cache-dtype',
|
|
||||||
type=str,
|
|
||||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
|
||||||
default="auto",
|
|
||||||
help='Data type for kv cache storage. If "auto", will use model '
|
|
||||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
|
||||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
|
||||||
parser.add_argument(
|
|
||||||
'--quantization-param-path',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
|
||||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
|
||||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
|
||||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
|
||||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
|
||||||
'instead supported for common inference criteria.')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--profile',
|
'--profile',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@@ -203,78 +128,12 @@ if __name__ == '__main__':
|
|||||||
default=None,
|
default=None,
|
||||||
help=('path to save the pytorch profiler output. Can be visualized '
|
help=('path to save the pytorch profiler output. Can be visualized '
|
||||||
'with ui.perfetto.dev or Tensorboard.'))
|
'with ui.perfetto.dev or Tensorboard.'))
|
||||||
parser.add_argument("--device",
|
|
||||||
type=str,
|
|
||||||
default="auto",
|
|
||||||
choices=DEVICE_OPTIONS,
|
|
||||||
help='device type for vLLM execution')
|
|
||||||
parser.add_argument('--block-size',
|
|
||||||
type=int,
|
|
||||||
default=16,
|
|
||||||
help='block size of key/value cache')
|
|
||||||
parser.add_argument(
|
|
||||||
'--enable-chunked-prefill',
|
|
||||||
action='store_true',
|
|
||||||
help='If True, the prefill requests can be chunked based on the '
|
|
||||||
'max_num_batched_tokens')
|
|
||||||
parser.add_argument("--enable-prefix-caching",
|
|
||||||
action='store_true',
|
|
||||||
help="Enable automatic prefix caching")
|
|
||||||
parser.add_argument(
|
|
||||||
"--ray-workers-use-nsight",
|
|
||||||
action='store_true',
|
|
||||||
help="If specified, use nsight to profile ray workers",
|
|
||||||
)
|
|
||||||
parser.add_argument('--download-dir',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='directory to download and load the weights, '
|
|
||||||
'default to the default cache dir of huggingface')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
'--output-json',
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the latency results in JSON format.')
|
help='Path to save the latency results in JSON format.')
|
||||||
parser.add_argument('--gpu-memory-utilization',
|
|
||||||
type=float,
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
default=0.9,
|
|
||||||
help='the fraction of GPU memory to be used for '
|
|
||||||
'the model executor, which can range from 0 to 1.'
|
|
||||||
'If unspecified, will use the default value of 0.9.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--load-format',
|
|
||||||
type=str,
|
|
||||||
default=EngineArgs.load_format,
|
|
||||||
choices=[
|
|
||||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
|
||||||
'bitsandbytes'
|
|
||||||
],
|
|
||||||
help='The format of the model weights to load.\n\n'
|
|
||||||
'* "auto" will try to load the weights in the safetensors format '
|
|
||||||
'and fall back to the pytorch bin format if safetensors format '
|
|
||||||
'is not available.\n'
|
|
||||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
|
||||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
|
||||||
'* "npcache" will load the weights in pytorch format and store '
|
|
||||||
'a numpy cache to speed up the loading.\n'
|
|
||||||
'* "dummy" will initialize the weights with random values, '
|
|
||||||
'which is mainly for profiling.\n'
|
|
||||||
'* "tensorizer" will load the weights using tensorizer from '
|
|
||||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
|
||||||
'section for more information.\n'
|
|
||||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
|
||||||
'quantization.\n')
|
|
||||||
parser.add_argument(
|
|
||||||
'--distributed-executor-backend',
|
|
||||||
choices=['ray', 'mp'],
|
|
||||||
default=None,
|
|
||||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
|
||||||
'is used, will be automatically set to "ray" if installed '
|
|
||||||
'or "mp" (multiprocessing) otherwise.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--otlp-traces-endpoint',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='Target URL to which OpenTelemetry traces will be sent.')
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ ShareGPT example usage:
|
|||||||
--input-length-range 128:256
|
--input-length-range 128:256
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
@@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
|
|||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -116,7 +118,7 @@ def main(args):
|
|||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
if args.dataset_path is not None:
|
if args.dataset_path is not None:
|
||||||
print(f"Start to sample {args.num_prompts} prompts"
|
print(f"Start to sample {args.num_prompts} prompts"
|
||||||
"from {args.dataset_path}")
|
f"from {args.dataset_path}")
|
||||||
filtered_datasets = sample_requests(
|
filtered_datasets = sample_requests(
|
||||||
dataset_path=args.dataset_path,
|
dataset_path=args.dataset_path,
|
||||||
num_requests=args.num_prompts,
|
num_requests=args.num_prompts,
|
||||||
@@ -129,12 +131,9 @@ def main(args):
|
|||||||
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
|
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
|
||||||
] * args.num_prompts
|
] * args.num_prompts
|
||||||
|
|
||||||
llm = LLM(model=args.model,
|
engine_args = EngineArgs.from_cli_args(args)
|
||||||
tokenizer_mode='auto',
|
|
||||||
trust_remote_code=True,
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
enforce_eager=True,
|
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
|
||||||
enable_prefix_caching=args.enable_prefix_caching)
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||||
|
|
||||||
@@ -143,13 +142,6 @@ def main(args):
|
|||||||
repeat_count=args.repeat_count,
|
repeat_count=args.repeat_count,
|
||||||
sort=args.sort)
|
sort=args.sort)
|
||||||
|
|
||||||
print("------warm up------")
|
|
||||||
test_prefix(
|
|
||||||
llm=llm,
|
|
||||||
prompts=prompts,
|
|
||||||
sampling_params=sampling_params,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("------start generating------")
|
print("------start generating------")
|
||||||
test_prefix(
|
test_prefix(
|
||||||
llm=llm,
|
llm=llm,
|
||||||
@@ -162,18 +154,11 @@ if __name__ == "__main__":
|
|||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description=
|
description=
|
||||||
'Benchmark the performance with or without automatic prefix caching.')
|
'Benchmark the performance with or without automatic prefix caching.')
|
||||||
parser.add_argument('--model',
|
|
||||||
type=str,
|
|
||||||
default='baichuan-inc/Baichuan2-13B-Chat')
|
|
||||||
parser.add_argument("--dataset-path",
|
parser.add_argument("--dataset-path",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Path to the dataset.")
|
help="Path to the dataset.")
|
||||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
|
||||||
parser.add_argument('--output-len', type=int, default=10)
|
parser.add_argument('--output-len', type=int, default=10)
|
||||||
parser.add_argument('--enable-prefix-caching',
|
|
||||||
action='store_true',
|
|
||||||
help='enable prefix caching')
|
|
||||||
parser.add_argument('--num-prompts',
|
parser.add_argument('--num-prompts',
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=1,
|
||||||
@@ -190,9 +175,7 @@ if __name__ == "__main__":
|
|||||||
default='128:256',
|
default='128:256',
|
||||||
help='Range of input lengths for sampling prompts,'
|
help='Range of input lengths for sampling prompts,'
|
||||||
'specified as "min:max" (e.g., "128:256").')
|
'specified as "min:max" (e.g., "128:256").')
|
||||||
parser.add_argument("--seed",
|
|
||||||
type=int,
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
default=0,
|
|
||||||
help='Random seed for reproducibility')
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Benchmark offline prioritization."""
|
"""Benchmark offline prioritization."""
|
||||||
import argparse
|
import argparse
|
||||||
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
@@ -7,7 +8,8 @@ from typing import List, Optional, Tuple
|
|||||||
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
def sample_requests(
|
||||||
@@ -62,46 +64,11 @@ def sample_requests(
|
|||||||
|
|
||||||
def run_vllm(
|
def run_vllm(
|
||||||
requests: List[Tuple[str, int, int]],
|
requests: List[Tuple[str, int, int]],
|
||||||
model: str,
|
|
||||||
tokenizer: str,
|
|
||||||
quantization: Optional[str],
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
seed: int,
|
|
||||||
n: int,
|
n: int,
|
||||||
trust_remote_code: bool,
|
engine_args: EngineArgs,
|
||||||
dtype: str,
|
|
||||||
max_model_len: Optional[int],
|
|
||||||
enforce_eager: bool,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
quantization_param_path: Optional[str],
|
|
||||||
device: str,
|
|
||||||
enable_prefix_caching: bool,
|
|
||||||
enable_chunked_prefill: bool,
|
|
||||||
max_num_batched_tokens: int,
|
|
||||||
gpu_memory_utilization: float = 0.9,
|
|
||||||
download_dir: Optional[str] = None,
|
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
model=model,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
quantization=quantization,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
seed=seed,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=max_model_len,
|
|
||||||
gpu_memory_utilization=gpu_memory_utilization,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
kv_cache_dtype=kv_cache_dtype,
|
|
||||||
quantization_param_path=quantization_param_path,
|
|
||||||
device=device,
|
|
||||||
enable_prefix_caching=enable_prefix_caching,
|
|
||||||
download_dir=download_dir,
|
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
|
||||||
disable_log_stats=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts = []
|
prompts = []
|
||||||
@@ -142,16 +109,8 @@ def main(args: argparse.Namespace):
|
|||||||
args.output_len)
|
args.output_len)
|
||||||
|
|
||||||
if args.backend == "vllm":
|
if args.backend == "vllm":
|
||||||
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
|
elapsed_time = run_vllm(requests, args.n,
|
||||||
args.quantization, args.tensor_parallel_size,
|
EngineArgs.from_cli_args(args))
|
||||||
args.seed, args.n, args.trust_remote_code,
|
|
||||||
args.dtype, args.max_model_len,
|
|
||||||
args.enforce_eager, args.kv_cache_dtype,
|
|
||||||
args.quantization_param_path, args.device,
|
|
||||||
args.enable_prefix_caching,
|
|
||||||
args.enable_chunked_prefill,
|
|
||||||
args.max_num_batched_tokens,
|
|
||||||
args.gpu_memory_utilization, args.download_dir)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {args.backend}")
|
raise ValueError(f"Unknown backend: {args.backend}")
|
||||||
total_num_tokens = sum(prompt_len + output_len
|
total_num_tokens = sum(prompt_len + output_len
|
||||||
@@ -173,7 +132,7 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
|
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||||
parser.add_argument("--backend",
|
parser.add_argument("--backend",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["vllm", "hf", "mii"],
|
choices=["vllm", "hf", "mii"],
|
||||||
@@ -191,13 +150,6 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="Output length for each request. Overrides the "
|
help="Output length for each request. Overrides the "
|
||||||
"output length from the dataset.")
|
"output length from the dataset.")
|
||||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
|
||||||
parser.add_argument("--tokenizer", type=str, default=None)
|
|
||||||
parser.add_argument('--quantization',
|
|
||||||
'-q',
|
|
||||||
choices=[*QUANTIZATION_METHODS, None],
|
|
||||||
default=None)
|
|
||||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
|
||||||
parser.add_argument("--n",
|
parser.add_argument("--n",
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=1,
|
||||||
@@ -206,81 +158,13 @@ if __name__ == "__main__":
|
|||||||
type=int,
|
type=int,
|
||||||
default=200,
|
default=200,
|
||||||
help="Number of prompts to process.")
|
help="Number of prompts to process.")
|
||||||
parser.add_argument("--seed", type=int, default=0)
|
|
||||||
parser.add_argument('--trust-remote-code',
|
|
||||||
action='store_true',
|
|
||||||
help='trust remote code from huggingface')
|
|
||||||
parser.add_argument(
|
|
||||||
'--max-model-len',
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help='Maximum length of a sequence (including prompt and output). '
|
|
||||||
'If None, will be derived from the model.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--dtype',
|
|
||||||
type=str,
|
|
||||||
default='auto',
|
|
||||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
|
||||||
help='data type for model weights and activations. '
|
|
||||||
'The "auto" option will use FP16 precision '
|
|
||||||
'for FP32 and FP16 models, and BF16 precision '
|
|
||||||
'for BF16 models.')
|
|
||||||
parser.add_argument('--gpu-memory-utilization',
|
|
||||||
type=float,
|
|
||||||
default=0.9,
|
|
||||||
help='the fraction of GPU memory to be used for '
|
|
||||||
'the model executor, which can range from 0 to 1.'
|
|
||||||
'If unspecified, will use the default value of 0.9.')
|
|
||||||
parser.add_argument("--enforce-eager",
|
|
||||||
action="store_true",
|
|
||||||
help="enforce eager execution")
|
|
||||||
parser.add_argument(
|
|
||||||
'--kv-cache-dtype',
|
|
||||||
type=str,
|
|
||||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
|
||||||
default="auto",
|
|
||||||
help='Data type for kv cache storage. If "auto", will use model '
|
|
||||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
|
||||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
|
||||||
parser.add_argument(
|
|
||||||
'--quantization-param-path',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
|
||||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
|
||||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
|
||||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
|
||||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
|
||||||
'instead supported for common inference criteria.')
|
|
||||||
parser.add_argument(
|
|
||||||
"--device",
|
|
||||||
type=str,
|
|
||||||
default="cuda",
|
|
||||||
choices=["cuda", "cpu"],
|
|
||||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
|
||||||
parser.add_argument(
|
|
||||||
"--enable-prefix-caching",
|
|
||||||
action='store_true',
|
|
||||||
help="enable automatic prefix caching for vLLM backend.")
|
|
||||||
parser.add_argument("--enable-chunked-prefill",
|
|
||||||
action='store_true',
|
|
||||||
help="enable chunked prefill for vLLM backend.")
|
|
||||||
parser.add_argument('--max-num-batched-tokens',
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help='maximum number of batched tokens per '
|
|
||||||
'iteration')
|
|
||||||
parser.add_argument('--download-dir',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='directory to download and load the weights, '
|
|
||||||
'default to the default cache dir of huggingface')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
'--output-json',
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help='Path to save the throughput results in JSON format.')
|
||||||
|
|
||||||
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
@@ -53,6 +53,8 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BenchmarkMetrics:
|
class BenchmarkMetrics:
|
||||||
@@ -60,6 +62,7 @@ class BenchmarkMetrics:
|
|||||||
total_input: int
|
total_input: int
|
||||||
total_output: int
|
total_output: int
|
||||||
request_throughput: float
|
request_throughput: float
|
||||||
|
request_goodput: float
|
||||||
output_throughput: float
|
output_throughput: float
|
||||||
total_token_throughput: float
|
total_token_throughput: float
|
||||||
mean_ttft_ms: float
|
mean_ttft_ms: float
|
||||||
@@ -202,6 +205,7 @@ def sample_hf_requests(
|
|||||||
dataset_split: str,
|
dataset_split: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
|
random_seed: int,
|
||||||
fixed_output_len: Optional[int] = None,
|
fixed_output_len: Optional[int] = None,
|
||||||
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
|
||||||
dataset = load_dataset(dataset_path,
|
dataset = load_dataset(dataset_path,
|
||||||
@@ -210,8 +214,8 @@ def sample_hf_requests(
|
|||||||
streaming=True)
|
streaming=True)
|
||||||
assert "conversations" in dataset.features, (
|
assert "conversations" in dataset.features, (
|
||||||
"HF Dataset must have 'conversations' column.")
|
"HF Dataset must have 'conversations' column.")
|
||||||
filtered_dataset = dataset.shuffle().filter(
|
filter_func = lambda x: len(x["conversations"]) >= 2
|
||||||
lambda x: len(x["conversations"]) >= 2)
|
filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
|
||||||
sampled_requests: List[Tuple[str, int, int, Dict[str,
|
sampled_requests: List[Tuple[str, int, int, Dict[str,
|
||||||
Collection[str]]]] = []
|
Collection[str]]]] = []
|
||||||
for data in filtered_dataset:
|
for data in filtered_dataset:
|
||||||
@@ -293,8 +297,33 @@ def sample_random_requests(
|
|||||||
async def get_request(
|
async def get_request(
|
||||||
input_requests: List[Tuple[str, int, int]],
|
input_requests: List[Tuple[str, int, int]],
|
||||||
request_rate: float,
|
request_rate: float,
|
||||||
|
burstiness: float = 1.0,
|
||||||
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
||||||
|
"""
|
||||||
|
Asynchronously generates requests at a specified rate
|
||||||
|
with OPTIONAL burstiness.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_requests:
|
||||||
|
A list of input requests, each represented as a tuple.
|
||||||
|
request_rate:
|
||||||
|
The rate at which requests are generated (requests/s).
|
||||||
|
burstiness (optional):
|
||||||
|
The burstiness factor of the request generation.
|
||||||
|
Only takes effect when request_rate is not inf.
|
||||||
|
Default value is 1, which follows a Poisson process.
|
||||||
|
Otherwise, the request intervals follow a gamma distribution.
|
||||||
|
A lower burstiness value (0 < burstiness < 1) results
|
||||||
|
in more bursty requests, while a higher burstiness value
|
||||||
|
(burstiness > 1) results in a more uniform arrival of requests.
|
||||||
|
"""
|
||||||
input_requests = iter(input_requests)
|
input_requests = iter(input_requests)
|
||||||
|
|
||||||
|
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||||
|
assert burstiness > 0, (
|
||||||
|
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||||
|
theta = 1.0 / (request_rate * burstiness)
|
||||||
|
|
||||||
for request in input_requests:
|
for request in input_requests:
|
||||||
yield request
|
yield request
|
||||||
|
|
||||||
@@ -302,8 +331,9 @@ async def get_request(
|
|||||||
# If the request rate is infinity, then we don't need to wait.
|
# If the request rate is infinity, then we don't need to wait.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Sample the request interval from the exponential distribution.
|
# Sample the request interval from the gamma distribution.
|
||||||
interval = np.random.exponential(1.0 / request_rate)
|
# If burstiness is 1, it follows exponential distribution.
|
||||||
|
interval = np.random.gamma(shape=burstiness, scale=theta)
|
||||||
# The next request will be sent after the interval.
|
# The next request will be sent after the interval.
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
@@ -315,12 +345,15 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: List[str],
|
selected_percentile_metrics: List[str],
|
||||||
selected_percentiles: List[float],
|
selected_percentiles: List[float],
|
||||||
|
gootput_config_dict: Dict[str, float],
|
||||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||||
actual_output_lens: List[int] = []
|
actual_output_lens: List[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
completed = 0
|
completed = 0
|
||||||
|
good_completed = 0
|
||||||
itls: List[float] = []
|
itls: List[float] = []
|
||||||
tpots: List[float] = []
|
tpots: List[float] = []
|
||||||
|
all_tpots: List[float] = []
|
||||||
ttfts: List[float] = []
|
ttfts: List[float] = []
|
||||||
e2els: List[float] = []
|
e2els: List[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
@@ -334,9 +367,13 @@ def calculate_metrics(
|
|||||||
add_special_tokens=False).input_ids)
|
add_special_tokens=False).input_ids)
|
||||||
actual_output_lens.append(output_len)
|
actual_output_lens.append(output_len)
|
||||||
total_input += input_requests[i][1]
|
total_input += input_requests[i][1]
|
||||||
|
tpot = 0
|
||||||
if output_len > 1:
|
if output_len > 1:
|
||||||
tpots.append(
|
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
||||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
1)
|
||||||
|
tpots.append(tpot)
|
||||||
|
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||||
|
all_tpots.append(tpot)
|
||||||
itls += outputs[i].itl
|
itls += outputs[i].itl
|
||||||
ttfts.append(outputs[i].ttft)
|
ttfts.append(outputs[i].ttft)
|
||||||
e2els.append(outputs[i].latency)
|
e2els.append(outputs[i].latency)
|
||||||
@@ -344,6 +381,28 @@ def calculate_metrics(
|
|||||||
else:
|
else:
|
||||||
actual_output_lens.append(0)
|
actual_output_lens.append(0)
|
||||||
|
|
||||||
|
if gootput_config_dict:
|
||||||
|
valid_metrics = []
|
||||||
|
slo_values = []
|
||||||
|
|
||||||
|
if "ttft" in gootput_config_dict:
|
||||||
|
valid_metrics.append(ttfts)
|
||||||
|
slo_values.append(gootput_config_dict["ttft"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
if "tpot" in gootput_config_dict:
|
||||||
|
valid_metrics.append(all_tpots)
|
||||||
|
slo_values.append(gootput_config_dict["tpot"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
if "e2el" in gootput_config_dict:
|
||||||
|
valid_metrics.append(e2els)
|
||||||
|
slo_values.append(gootput_config_dict["e2el"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
|
||||||
|
for req_metric in zip(*valid_metrics):
|
||||||
|
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||||
|
if is_good_req:
|
||||||
|
good_completed += 1
|
||||||
|
|
||||||
if completed == 0:
|
if completed == 0:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"All requests failed. This is likely due to a misconfiguration "
|
"All requests failed. This is likely due to a misconfiguration "
|
||||||
@@ -354,6 +413,7 @@ def calculate_metrics(
|
|||||||
total_input=total_input,
|
total_input=total_input,
|
||||||
total_output=sum(actual_output_lens),
|
total_output=sum(actual_output_lens),
|
||||||
request_throughput=completed / dur_s,
|
request_throughput=completed / dur_s,
|
||||||
|
request_goodput=good_completed / dur_s,
|
||||||
output_throughput=sum(actual_output_lens) / dur_s,
|
output_throughput=sum(actual_output_lens) / dur_s,
|
||||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||||
@@ -372,9 +432,9 @@ def calculate_metrics(
|
|||||||
median_itl_ms=np.median(itls or 0) * 1000,
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||||
for p in selected_percentiles],
|
for p in selected_percentiles],
|
||||||
mean_e2el_ms=np.median(e2els or 0) * 1000,
|
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||||
median_e2el_ms=np.mean(e2els or 0) * 1000,
|
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||||
for p in selected_percentiles],
|
for p in selected_percentiles],
|
||||||
)
|
)
|
||||||
@@ -392,11 +452,14 @@ async def benchmark(
|
|||||||
logprobs: Optional[int],
|
logprobs: Optional[int],
|
||||||
best_of: int,
|
best_of: int,
|
||||||
request_rate: float,
|
request_rate: float,
|
||||||
|
burstiness: float,
|
||||||
disable_tqdm: bool,
|
disable_tqdm: bool,
|
||||||
profile: bool,
|
profile: bool,
|
||||||
selected_percentile_metrics: List[str],
|
selected_percentile_metrics: List[str],
|
||||||
selected_percentiles: List[str],
|
selected_percentiles: List[str],
|
||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
|
gootput_config_dict: Dict[str, float],
|
||||||
|
max_concurrency: Optional[int],
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@@ -444,13 +507,35 @@ async def benchmark(
|
|||||||
if profile_output.success:
|
if profile_output.success:
|
||||||
print("Profiler started")
|
print("Profiler started")
|
||||||
|
|
||||||
|
if burstiness == 1.0:
|
||||||
|
distribution = "Poisson process"
|
||||||
|
else:
|
||||||
|
distribution = "Gamma distribution"
|
||||||
|
|
||||||
print(f"Traffic request rate: {request_rate}")
|
print(f"Traffic request rate: {request_rate}")
|
||||||
|
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||||
|
print(f"Maximum request concurrency: {max_concurrency}")
|
||||||
|
|
||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
|
# This can be used once the minimum Python version is 3.10 or higher,
|
||||||
|
# and it will simplify the code in limited_request_func.
|
||||||
|
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
|
# if max_concurrency else contextlib.nullcontext())
|
||||||
|
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
|
if max_concurrency else None)
|
||||||
|
|
||||||
|
async def limited_request_func(request_func_input, pbar):
|
||||||
|
if semaphore is None:
|
||||||
|
return await request_func(request_func_input=request_func_input,
|
||||||
|
pbar=pbar)
|
||||||
|
async with semaphore:
|
||||||
|
return await request_func(request_func_input=request_func_input,
|
||||||
|
pbar=pbar)
|
||||||
|
|
||||||
benchmark_start_time = time.perf_counter()
|
benchmark_start_time = time.perf_counter()
|
||||||
tasks: List[asyncio.Task] = []
|
tasks: List[asyncio.Task] = []
|
||||||
async for request in get_request(input_requests, request_rate):
|
async for request in get_request(input_requests, request_rate, burstiness):
|
||||||
prompt, prompt_len, output_len, mm_content = request
|
prompt, prompt_len, output_len, mm_content = request
|
||||||
request_func_input = RequestFuncInput(model=model_id,
|
request_func_input = RequestFuncInput(model=model_id,
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
@@ -463,8 +548,8 @@ async def benchmark(
|
|||||||
ignore_eos=ignore_eos)
|
ignore_eos=ignore_eos)
|
||||||
tasks.append(
|
tasks.append(
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
request_func(request_func_input=request_func_input,
|
limited_request_func(request_func_input=request_func_input,
|
||||||
pbar=pbar)))
|
pbar=pbar)))
|
||||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
if profile:
|
if profile:
|
||||||
@@ -494,6 +579,7 @@ async def benchmark(
|
|||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
selected_percentile_metrics=selected_percentile_metrics,
|
selected_percentile_metrics=selected_percentile_metrics,
|
||||||
selected_percentiles=selected_percentiles,
|
selected_percentiles=selected_percentiles,
|
||||||
|
gootput_config_dict=gootput_config_dict,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||||
@@ -505,6 +591,9 @@ async def benchmark(
|
|||||||
metrics.total_output))
|
metrics.total_output))
|
||||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||||
metrics.request_throughput))
|
metrics.request_throughput))
|
||||||
|
if gootput_config_dict:
|
||||||
|
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||||
|
metrics.request_goodput))
|
||||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||||
metrics.output_throughput))
|
metrics.output_throughput))
|
||||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||||
@@ -516,6 +605,8 @@ async def benchmark(
|
|||||||
"total_input_tokens": metrics.total_input,
|
"total_input_tokens": metrics.total_input,
|
||||||
"total_output_tokens": metrics.total_output,
|
"total_output_tokens": metrics.total_output,
|
||||||
"request_throughput": metrics.request_throughput,
|
"request_throughput": metrics.request_throughput,
|
||||||
|
"request_goodput:":
|
||||||
|
metrics.request_goodput if gootput_config_dict else None,
|
||||||
"output_throughput": metrics.output_throughput,
|
"output_throughput": metrics.output_throughput,
|
||||||
"total_token_throughput": metrics.total_token_throughput,
|
"total_token_throughput": metrics.total_token_throughput,
|
||||||
"input_lens": [output.prompt_len for output in outputs],
|
"input_lens": [output.prompt_len for output in outputs],
|
||||||
@@ -569,6 +660,41 @@ async def benchmark(
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def check_goodput_args(args):
|
||||||
|
# Check and parse goodput arguments
|
||||||
|
gootput_config_dict = {}
|
||||||
|
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||||
|
if args.goodput:
|
||||||
|
gootput_config_dict = parse_goodput(args.goodput)
|
||||||
|
for slo_name, slo_val in gootput_config_dict.items():
|
||||||
|
if slo_name not in VALID_NAMES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||||
|
"The service level objective name should be one of "
|
||||||
|
f"{str(VALID_NAMES)}. ")
|
||||||
|
if slo_val < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||||
|
"The service level objective value should be "
|
||||||
|
"non-negative.")
|
||||||
|
return gootput_config_dict
|
||||||
|
|
||||||
|
|
||||||
|
def parse_goodput(slo_pairs):
|
||||||
|
gootput_config_dict = {}
|
||||||
|
try:
|
||||||
|
for slo_pair in slo_pairs:
|
||||||
|
slo_name, slo_val = slo_pair.split(":")
|
||||||
|
gootput_config_dict[slo_name] = float(slo_val)
|
||||||
|
except ValueError as err:
|
||||||
|
raise argparse.ArgumentTypeError(
|
||||||
|
"Invalid format found for service level objectives. "
|
||||||
|
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||||
|
"pairs, where the key is a metric name, and the value is a "
|
||||||
|
"number in milliseconds.") from err
|
||||||
|
return gootput_config_dict
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
@@ -646,6 +772,7 @@ def main(args: argparse.Namespace):
|
|||||||
dataset_split=args.hf_split,
|
dataset_split=args.hf_split,
|
||||||
num_requests=args.num_prompts,
|
num_requests=args.num_prompts,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
|
random_seed=args.seed,
|
||||||
fixed_output_len=args.hf_output_len,
|
fixed_output_len=args.hf_output_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -662,6 +789,8 @@ def main(args: argparse.Namespace):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||||
|
|
||||||
|
gootput_config_dict = check_goodput_args(args)
|
||||||
|
|
||||||
benchmark_result = asyncio.run(
|
benchmark_result = asyncio.run(
|
||||||
benchmark(
|
benchmark(
|
||||||
backend=backend,
|
backend=backend,
|
||||||
@@ -673,6 +802,7 @@ def main(args: argparse.Namespace):
|
|||||||
logprobs=args.logprobs,
|
logprobs=args.logprobs,
|
||||||
best_of=args.best_of,
|
best_of=args.best_of,
|
||||||
request_rate=args.request_rate,
|
request_rate=args.request_rate,
|
||||||
|
burstiness=args.burstiness,
|
||||||
disable_tqdm=args.disable_tqdm,
|
disable_tqdm=args.disable_tqdm,
|
||||||
profile=args.profile,
|
profile=args.profile,
|
||||||
selected_percentile_metrics=args.percentile_metrics.split(","),
|
selected_percentile_metrics=args.percentile_metrics.split(","),
|
||||||
@@ -680,6 +810,8 @@ def main(args: argparse.Namespace):
|
|||||||
float(p) for p in args.metric_percentiles.split(",")
|
float(p) for p in args.metric_percentiles.split(",")
|
||||||
],
|
],
|
||||||
ignore_eos=args.ignore_eos,
|
ignore_eos=args.ignore_eos,
|
||||||
|
gootput_config_dict=gootput_config_dict,
|
||||||
|
max_concurrency=args.max_concurrency,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
@@ -709,13 +841,17 @@ def main(args: argparse.Namespace):
|
|||||||
# Traffic
|
# Traffic
|
||||||
result_json["request_rate"] = (
|
result_json["request_rate"] = (
|
||||||
args.request_rate if args.request_rate < float("inf") else "inf")
|
args.request_rate if args.request_rate < float("inf") else "inf")
|
||||||
|
result_json["burstiness"] = args.burstiness
|
||||||
|
result_json["max_concurrency"] = args.max_concurrency
|
||||||
|
|
||||||
# Merge with benchmark result
|
# Merge with benchmark result
|
||||||
result_json = {**result_json, **benchmark_result}
|
result_json = {**result_json, **benchmark_result}
|
||||||
|
|
||||||
# Save to file
|
# Save to file
|
||||||
base_model_id = model_id.split("/")[-1]
|
base_model_id = model_id.split("/")[-1]
|
||||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
|
||||||
|
if args.max_concurrency is not None else "")
|
||||||
|
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
|
||||||
if args.result_filename:
|
if args.result_filename:
|
||||||
file_name = args.result_filename
|
file_name = args.result_filename
|
||||||
if args.result_dir:
|
if args.result_dir:
|
||||||
@@ -766,6 +902,19 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="Path to the sharegpt/sonnet dataset. "
|
help="Path to the sharegpt/sonnet dataset. "
|
||||||
"Or the huggingface dataset ID if using HF dataset.")
|
"Or the huggingface dataset ID if using HF dataset.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-concurrency",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Maximum number of concurrent requests. This can be used "
|
||||||
|
"to help simulate an environment where a higher level component "
|
||||||
|
"is enforcing a maximum number of concurrent requests. While the "
|
||||||
|
"--request-rate argument controls the rate at which requests are "
|
||||||
|
"initiated, this argument will control how many are actually allowed "
|
||||||
|
"to execute at a time. This means that when used in combination, the "
|
||||||
|
"actual request rate may be lower than specified with --request-rate, "
|
||||||
|
"if the server is not processing requests fast enough to keep up.")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model",
|
"--model",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -808,8 +957,20 @@ if __name__ == "__main__":
|
|||||||
default=float("inf"),
|
default=float("inf"),
|
||||||
help="Number of requests per second. If this is inf, "
|
help="Number of requests per second. If this is inf, "
|
||||||
"then all the requests are sent at time 0. "
|
"then all the requests are sent at time 0. "
|
||||||
"Otherwise, we use Poisson process to synthesize "
|
"Otherwise, we use Poisson process or gamma distribution "
|
||||||
"the request arrival times.",
|
"to synthesize the request arrival times.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--burstiness",
|
||||||
|
type=float,
|
||||||
|
default=1.0,
|
||||||
|
help="Burstiness factor of the request generation. "
|
||||||
|
"Only take effect when request_rate is not inf. "
|
||||||
|
"Default value is 1, which follows Poisson process. "
|
||||||
|
"Otherwise, the request intervals follow a gamma distribution. "
|
||||||
|
"A lower burstiness value (0 < burstiness < 1) results in more "
|
||||||
|
"bursty requests. A higher burstiness value (burstiness > 1) "
|
||||||
|
"results in a more uniform arrival of requests.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--seed", type=int, default=0)
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -879,6 +1040,17 @@ if __name__ == "__main__":
|
|||||||
"Default value is \"99\". "
|
"Default value is \"99\". "
|
||||||
"Use \"--percentile-metrics\" to select metrics.",
|
"Use \"--percentile-metrics\" to select metrics.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--goodput",
|
||||||
|
nargs="+",
|
||||||
|
required=False,
|
||||||
|
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||||
|
"pairs, where the key is a metric name, and the value is in "
|
||||||
|
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||||
|
"separated by spaces. Allowed request level metric names are "
|
||||||
|
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||||
|
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||||
|
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||||
|
|
||||||
# group for dataset specific arguments
|
# group for dataset specific arguments
|
||||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||||
|
|||||||
@@ -1,30 +1,71 @@
|
|||||||
"""Benchmark offline inference throughput."""
|
"""Benchmark offline inference throughput."""
|
||||||
import argparse
|
import argparse
|
||||||
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import uvloop
|
import uvloop
|
||||||
|
from PIL import Image
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||||
PreTrainedTokenizerBase)
|
PreTrainedTokenizerBase)
|
||||||
|
|
||||||
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||||
from vllm.entrypoints.openai.api_server import (
|
from vllm.entrypoints.openai.api_server import (
|
||||||
build_async_engine_client_from_engine_args)
|
build_async_engine_client_from_engine_args)
|
||||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
from vllm.inputs import TextPrompt
|
||||||
|
from vllm.multimodal import MultiModalDataDict
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(
|
@dataclasses.dataclass
|
||||||
dataset_path: str,
|
class SampleRequest:
|
||||||
num_requests: int,
|
"""A class representing a single inference request for benchmarking.
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
fixed_output_len: Optional[int],
|
Attributes:
|
||||||
) -> List[Tuple[str, int, int]]:
|
prompt: The input text prompt for the model.
|
||||||
|
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
|
||||||
|
images).
|
||||||
|
prompt_len: The length of the prompt in tokens.
|
||||||
|
expected_output_len: The expected length of the output in tokens.
|
||||||
|
"""
|
||||||
|
prompt: str
|
||||||
|
prompt_len: int
|
||||||
|
expected_output_len: int
|
||||||
|
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
|
||||||
|
"""Prepend and append special tokens around the question to form a prompt.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
question: The input question text to wrap with special tokens
|
||||||
|
model: The name of the model being used, to determine which special
|
||||||
|
tokens to add
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The formatted prompt string with appropriate special tokens for the
|
||||||
|
model
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If an unsupported model name is provided
|
||||||
|
"""
|
||||||
|
model = model.lower()
|
||||||
|
if "pixtral" in model:
|
||||||
|
return f"<s>[INST]{question}\n[IMG][/INST]"
|
||||||
|
raise ValueError(f"Unsupported model {model}")
|
||||||
|
|
||||||
|
|
||||||
|
def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
||||||
|
args: argparse.Namespace) -> List[SampleRequest]:
|
||||||
|
dataset_path: str = args.dataset
|
||||||
|
num_requests: int = args.num_prompts
|
||||||
|
fixed_output_len: Optional[int] = args.output_len
|
||||||
|
model: str = args.model
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
|
|
||||||
@@ -33,23 +74,36 @@ def sample_requests(
|
|||||||
dataset = json.load(f)
|
dataset = json.load(f)
|
||||||
# Filter out the conversations with less than 2 turns.
|
# Filter out the conversations with less than 2 turns.
|
||||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||||
# Only keep the first two turns of each conversation.
|
|
||||||
dataset = [(data["conversations"][0]["value"],
|
|
||||||
data["conversations"][1]["value"]) for data in dataset]
|
|
||||||
|
|
||||||
# Shuffle the dataset.
|
# Shuffle the dataset.
|
||||||
random.shuffle(dataset)
|
random.shuffle(dataset)
|
||||||
|
|
||||||
# Filter out sequences that are too long or too short
|
# Filter out sequences that are too long or too short
|
||||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
filtered_dataset: List[SampleRequest] = []
|
||||||
for i in range(len(dataset)):
|
for data in dataset:
|
||||||
if len(filtered_dataset) == num_requests:
|
if len(filtered_dataset) == num_requests:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Only keep the first two turns of each conversation.
|
||||||
|
prompt = data["conversations"][0]["value"]
|
||||||
|
completion = data["conversations"][1]["value"]
|
||||||
|
|
||||||
|
multi_modal_data: Optional[MultiModalDataDict] = None
|
||||||
|
if "image" in data:
|
||||||
|
multi_modal_data = multi_modal_data or {}
|
||||||
|
image_path = data["image"]
|
||||||
|
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
|
||||||
|
assert isinstance(image_path,
|
||||||
|
str), "Only support single image input"
|
||||||
|
try:
|
||||||
|
multi_modal_data["image"] = Image.open(image_path).convert(
|
||||||
|
"RGB")
|
||||||
|
except FileNotFoundError:
|
||||||
|
# Ignore datapoint where asset is missing
|
||||||
|
continue
|
||||||
|
prompt = _get_prompt_for_image_model(question=prompt, model=model)
|
||||||
|
|
||||||
# Tokenize the prompts and completions.
|
# Tokenize the prompts and completions.
|
||||||
prompt = dataset[i][0]
|
|
||||||
prompt_token_ids = tokenizer(prompt).input_ids
|
prompt_token_ids = tokenizer(prompt).input_ids
|
||||||
completion = dataset[i][1]
|
|
||||||
completion_token_ids = tokenizer(completion).input_ids
|
completion_token_ids = tokenizer(completion).input_ids
|
||||||
prompt_len = len(prompt_token_ids)
|
prompt_len = len(prompt_token_ids)
|
||||||
output_len = len(completion_token_ids
|
output_len = len(completion_token_ids
|
||||||
@@ -60,73 +114,37 @@ def sample_requests(
|
|||||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||||
# Prune too long sequences.
|
# Prune too long sequences.
|
||||||
continue
|
continue
|
||||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
filtered_dataset.append(
|
||||||
|
SampleRequest(prompt=prompt,
|
||||||
|
prompt_len=prompt_len,
|
||||||
|
expected_output_len=output_len,
|
||||||
|
multi_modal_data=multi_modal_data))
|
||||||
|
|
||||||
return filtered_dataset
|
return filtered_dataset
|
||||||
|
|
||||||
|
|
||||||
def run_vllm(
|
def run_vllm(
|
||||||
requests: List[Tuple[str, int, int]],
|
requests: List[SampleRequest],
|
||||||
model: str,
|
|
||||||
tokenizer: str,
|
|
||||||
quantization: Optional[str],
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
seed: int,
|
|
||||||
n: int,
|
n: int,
|
||||||
trust_remote_code: bool,
|
engine_args: EngineArgs,
|
||||||
dtype: str,
|
|
||||||
max_model_len: Optional[int],
|
|
||||||
enforce_eager: bool,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
quantization_param_path: Optional[str],
|
|
||||||
device: str,
|
|
||||||
enable_prefix_caching: bool,
|
|
||||||
enable_chunked_prefill: bool,
|
|
||||||
max_num_batched_tokens: int,
|
|
||||||
distributed_executor_backend: Optional[str],
|
|
||||||
gpu_memory_utilization: float = 0.9,
|
|
||||||
num_scheduler_steps: int = 1,
|
|
||||||
download_dir: Optional[str] = None,
|
|
||||||
load_format: str = EngineArgs.load_format,
|
|
||||||
disable_async_output_proc: bool = False,
|
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
model=model,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
quantization=quantization,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
seed=seed,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=max_model_len,
|
|
||||||
gpu_memory_utilization=gpu_memory_utilization,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
kv_cache_dtype=kv_cache_dtype,
|
|
||||||
quantization_param_path=quantization_param_path,
|
|
||||||
device=device,
|
|
||||||
enable_prefix_caching=enable_prefix_caching,
|
|
||||||
download_dir=download_dir,
|
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
load_format=load_format,
|
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts: List[str] = []
|
prompts: List[TextPrompt] = []
|
||||||
sampling_params: List[SamplingParams] = []
|
sampling_params: List[SamplingParams] = []
|
||||||
for prompt, _, output_len in requests:
|
for request in requests:
|
||||||
prompts.append(prompt)
|
prompts.append(
|
||||||
|
TextPrompt(prompt=request.prompt,
|
||||||
|
multi_modal_data=request.multi_modal_data))
|
||||||
sampling_params.append(
|
sampling_params.append(
|
||||||
SamplingParams(
|
SamplingParams(
|
||||||
n=n,
|
n=n,
|
||||||
temperature=1.0,
|
temperature=1.0,
|
||||||
top_p=1.0,
|
top_p=1.0,
|
||||||
ignore_eos=True,
|
ignore_eos=True,
|
||||||
max_tokens=output_len,
|
max_tokens=request.expected_output_len,
|
||||||
))
|
))
|
||||||
|
|
||||||
use_beam_search = False
|
use_beam_search = False
|
||||||
@@ -136,11 +154,11 @@ def run_vllm(
|
|||||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
else:
|
else:
|
||||||
prompts = [prompt for prompt, _, _ in requests]
|
prompts = [request.prompt for request in requests]
|
||||||
# output_len should be the same for all requests.
|
# output_len should be the same for all requests.
|
||||||
output_len = requests[0][2]
|
output_len = requests[0][2]
|
||||||
for prompt, input_len, _output_len in requests:
|
for request in requests:
|
||||||
assert _output_len == output_len
|
assert request.expected_output_len == output_len
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
llm.beam_search(
|
llm.beam_search(
|
||||||
prompts,
|
prompts,
|
||||||
@@ -154,73 +172,30 @@ def run_vllm(
|
|||||||
|
|
||||||
|
|
||||||
async def run_vllm_async(
|
async def run_vllm_async(
|
||||||
requests: List[Tuple[str, int, int]],
|
requests: List[SampleRequest],
|
||||||
model: str,
|
|
||||||
tokenizer: str,
|
|
||||||
quantization: Optional[str],
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
seed: int,
|
|
||||||
n: int,
|
n: int,
|
||||||
trust_remote_code: bool,
|
engine_args: AsyncEngineArgs,
|
||||||
dtype: str,
|
|
||||||
max_model_len: Optional[int],
|
|
||||||
enforce_eager: bool,
|
|
||||||
kv_cache_dtype: str,
|
|
||||||
quantization_param_path: Optional[str],
|
|
||||||
device: str,
|
|
||||||
enable_prefix_caching: bool,
|
|
||||||
enable_chunked_prefill: bool,
|
|
||||||
max_num_batched_tokens: int,
|
|
||||||
distributed_executor_backend: Optional[str],
|
|
||||||
gpu_memory_utilization: float = 0.9,
|
|
||||||
num_scheduler_steps: int = 1,
|
|
||||||
download_dir: Optional[str] = None,
|
|
||||||
load_format: str = EngineArgs.load_format,
|
|
||||||
disable_async_output_proc: bool = False,
|
|
||||||
disable_frontend_multiprocessing: bool = False,
|
disable_frontend_multiprocessing: bool = False,
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
engine_args = AsyncEngineArgs(
|
|
||||||
model=model,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
quantization=quantization,
|
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
|
||||||
seed=seed,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
dtype=dtype,
|
|
||||||
max_model_len=max_model_len,
|
|
||||||
gpu_memory_utilization=gpu_memory_utilization,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
kv_cache_dtype=kv_cache_dtype,
|
|
||||||
quantization_param_path=quantization_param_path,
|
|
||||||
device=device,
|
|
||||||
enable_prefix_caching=enable_prefix_caching,
|
|
||||||
download_dir=download_dir,
|
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
|
||||||
load_format=load_format,
|
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
|
||||||
worker_use_ray=False,
|
|
||||||
disable_log_requests=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
async with build_async_engine_client_from_engine_args(
|
async with build_async_engine_client_from_engine_args(
|
||||||
engine_args, disable_frontend_multiprocessing) as llm:
|
engine_args, disable_frontend_multiprocessing) as llm:
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts: List[str] = []
|
prompts: List[TextPrompt] = []
|
||||||
sampling_params: List[SamplingParams] = []
|
sampling_params: List[SamplingParams] = []
|
||||||
for prompt, _, output_len in requests:
|
for request in requests:
|
||||||
prompts.append(prompt)
|
prompts.append(
|
||||||
|
TextPrompt(prompt=request.prompt,
|
||||||
|
multi_modal_data=request.multi_modal_data))
|
||||||
sampling_params.append(
|
sampling_params.append(
|
||||||
SamplingParams(
|
SamplingParams(
|
||||||
n=n,
|
n=n,
|
||||||
temperature=1.0,
|
temperature=1.0,
|
||||||
top_p=1.0,
|
top_p=1.0,
|
||||||
ignore_eos=True,
|
ignore_eos=True,
|
||||||
max_tokens=output_len,
|
max_tokens=request.expected_output_len,
|
||||||
))
|
))
|
||||||
|
|
||||||
generators = []
|
generators = []
|
||||||
@@ -236,7 +211,7 @@ async def run_vllm_async(
|
|||||||
|
|
||||||
|
|
||||||
def run_hf(
|
def run_hf(
|
||||||
requests: List[Tuple[str, int, int]],
|
requests: List[SampleRequest],
|
||||||
model: str,
|
model: str,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
n: int,
|
n: int,
|
||||||
@@ -294,14 +269,14 @@ def run_hf(
|
|||||||
|
|
||||||
|
|
||||||
def run_mii(
|
def run_mii(
|
||||||
requests: List[Tuple[str, int, int]],
|
requests: List[SampleRequest],
|
||||||
model: str,
|
model: str,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
output_len: int,
|
output_len: int,
|
||||||
) -> float:
|
) -> float:
|
||||||
from mii import client, serve
|
from mii import client, serve
|
||||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
||||||
prompts = [prompt for prompt, _, _ in requests]
|
prompts = [request.prompt for request in requests]
|
||||||
|
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
llm.generate(prompts, max_new_tokens=output_len)
|
llm.generate(prompts, max_new_tokens=output_len)
|
||||||
@@ -320,31 +295,39 @@ def main(args: argparse.Namespace):
|
|||||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||||
if args.dataset is None:
|
if args.dataset is None:
|
||||||
# Synthesize a prompt with the given input length.
|
# Synthesize a prompt with the given input length.
|
||||||
prompt = "hi" * (args.input_len - 1)
|
# As tokenizer may add additional tokens like BOS, we need to try
|
||||||
requests = [(prompt, args.input_len, args.output_len)
|
# different lengths to get the desired input length.
|
||||||
for _ in range(args.num_prompts)]
|
for i in range(-10, 10):
|
||||||
else:
|
prompt = "hi " * (args.input_len + i)
|
||||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
tokenized_prompt = tokenizer(prompt).input_ids
|
||||||
args.output_len)
|
if len(tokenized_prompt) == args.input_len:
|
||||||
|
break
|
||||||
if args.backend == "vllm":
|
|
||||||
run_args = [
|
|
||||||
requests, args.model, args.tokenizer, args.quantization,
|
|
||||||
args.tensor_parallel_size, args.seed, args.n,
|
|
||||||
args.trust_remote_code, args.dtype, args.max_model_len,
|
|
||||||
args.enforce_eager, args.kv_cache_dtype,
|
|
||||||
args.quantization_param_path, args.device,
|
|
||||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
|
||||||
args.max_num_batched_tokens, args.distributed_executor_backend,
|
|
||||||
args.gpu_memory_utilization, args.num_scheduler_steps,
|
|
||||||
args.download_dir, args.load_format, args.disable_async_output_proc
|
|
||||||
]
|
|
||||||
|
|
||||||
if args.async_engine:
|
|
||||||
run_args.append(args.disable_frontend_multiprocessing)
|
|
||||||
elapsed_time = uvloop.run(run_vllm_async(*run_args))
|
|
||||||
else:
|
else:
|
||||||
elapsed_time = run_vllm(*run_args)
|
raise ValueError(
|
||||||
|
f"Failed to synthesize a prompt with {args.input_len} tokens.")
|
||||||
|
requests = [
|
||||||
|
SampleRequest(prompt=prompt,
|
||||||
|
prompt_len=args.input_len,
|
||||||
|
expected_output_len=args.output_len)
|
||||||
|
for _ in range(args.num_prompts)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
requests = sample_requests(tokenizer, args)
|
||||||
|
|
||||||
|
is_multi_modal = any(request.multi_modal_data is not None
|
||||||
|
for request in requests)
|
||||||
|
if args.backend == "vllm":
|
||||||
|
if args.async_engine:
|
||||||
|
elapsed_time = uvloop.run(
|
||||||
|
run_vllm_async(
|
||||||
|
requests,
|
||||||
|
args.n,
|
||||||
|
AsyncEngineArgs.from_cli_args(args),
|
||||||
|
args.disable_frontend_multiprocessing,
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
elapsed_time = run_vllm(requests, args.n,
|
||||||
|
EngineArgs.from_cli_args(args))
|
||||||
elif args.backend == "hf":
|
elif args.backend == "hf":
|
||||||
assert args.tensor_parallel_size == 1
|
assert args.tensor_parallel_size == 1
|
||||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||||
@@ -354,10 +337,18 @@ def main(args: argparse.Namespace):
|
|||||||
args.output_len)
|
args.output_len)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {args.backend}")
|
raise ValueError(f"Unknown backend: {args.backend}")
|
||||||
total_num_tokens = sum(prompt_len + output_len
|
total_num_tokens = sum(request.prompt_len + request.expected_output_len
|
||||||
for _, prompt_len, output_len in requests)
|
for request in requests)
|
||||||
|
total_output_tokens = sum(request.expected_output_len
|
||||||
|
for request in requests)
|
||||||
|
if is_multi_modal:
|
||||||
|
print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
|
||||||
|
"following metrics are not accurate because image tokens are not"
|
||||||
|
" counted. See vllm-project/vllm/issues/9778 for details.")
|
||||||
|
# TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
|
||||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
||||||
|
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
|
||||||
|
|
||||||
# Output JSON results if specified
|
# Output JSON results if specified
|
||||||
if args.output_json:
|
if args.output_json:
|
||||||
@@ -381,7 +372,9 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--dataset",
|
parser.add_argument("--dataset",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Path to the dataset.")
|
help="Path to the dataset. The dataset is expected to "
|
||||||
|
"be a json in form of List[Dict[..., conversations: "
|
||||||
|
"List[Dict[..., value: <prompt_or_response>]]]]")
|
||||||
parser.add_argument("--input-len",
|
parser.add_argument("--input-len",
|
||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
@@ -391,13 +384,6 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="Output length for each request. Overrides the "
|
help="Output length for each request. Overrides the "
|
||||||
"output length from the dataset.")
|
"output length from the dataset.")
|
||||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
|
||||||
parser.add_argument("--tokenizer", type=str, default=None)
|
|
||||||
parser.add_argument('--quantization',
|
|
||||||
'-q',
|
|
||||||
choices=[*QUANTIZATION_METHODS, None],
|
|
||||||
default=None)
|
|
||||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
|
||||||
parser.add_argument("--n",
|
parser.add_argument("--n",
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=1,
|
||||||
@@ -406,123 +392,15 @@ if __name__ == "__main__":
|
|||||||
type=int,
|
type=int,
|
||||||
default=1000,
|
default=1000,
|
||||||
help="Number of prompts to process.")
|
help="Number of prompts to process.")
|
||||||
parser.add_argument("--seed", type=int, default=0)
|
|
||||||
parser.add_argument("--hf-max-batch-size",
|
parser.add_argument("--hf-max-batch-size",
|
||||||
type=int,
|
type=int,
|
||||||
default=None,
|
default=None,
|
||||||
help="Maximum batch size for HF backend.")
|
help="Maximum batch size for HF backend.")
|
||||||
parser.add_argument('--trust-remote-code',
|
|
||||||
action='store_true',
|
|
||||||
help='trust remote code from huggingface')
|
|
||||||
parser.add_argument(
|
|
||||||
'--max-model-len',
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help='Maximum length of a sequence (including prompt and output). '
|
|
||||||
'If None, will be derived from the model.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--dtype',
|
|
||||||
type=str,
|
|
||||||
default='auto',
|
|
||||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
|
||||||
help='data type for model weights and activations. '
|
|
||||||
'The "auto" option will use FP16 precision '
|
|
||||||
'for FP32 and FP16 models, and BF16 precision '
|
|
||||||
'for BF16 models.')
|
|
||||||
parser.add_argument('--gpu-memory-utilization',
|
|
||||||
type=float,
|
|
||||||
default=0.9,
|
|
||||||
help='the fraction of GPU memory to be used for '
|
|
||||||
'the model executor, which can range from 0 to 1.'
|
|
||||||
'If unspecified, will use the default value of 0.9.')
|
|
||||||
parser.add_argument("--enforce-eager",
|
|
||||||
action="store_true",
|
|
||||||
help="enforce eager execution")
|
|
||||||
parser.add_argument(
|
|
||||||
'--kv-cache-dtype',
|
|
||||||
type=str,
|
|
||||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
|
||||||
default="auto",
|
|
||||||
help='Data type for kv cache storage. If "auto", will use model '
|
|
||||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
|
||||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
|
||||||
parser.add_argument(
|
|
||||||
'--quantization-param-path',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
|
||||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
|
||||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
|
||||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
|
||||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
|
||||||
'instead supported for common inference criteria.')
|
|
||||||
parser.add_argument("--device",
|
|
||||||
type=str,
|
|
||||||
default="auto",
|
|
||||||
choices=DEVICE_OPTIONS,
|
|
||||||
help='device type for vLLM execution')
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-scheduler-steps",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="Maximum number of forward steps per scheduler call.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--enable-prefix-caching",
|
|
||||||
action='store_true',
|
|
||||||
help="Enable automatic prefix caching for vLLM backend.")
|
|
||||||
parser.add_argument("--enable-chunked-prefill",
|
|
||||||
action='store_true',
|
|
||||||
help="enable chunked prefill for vLLM backend.")
|
|
||||||
parser.add_argument('--max-num-batched-tokens',
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help='maximum number of batched tokens per '
|
|
||||||
'iteration')
|
|
||||||
parser.add_argument('--download-dir',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='directory to download and load the weights, '
|
|
||||||
'default to the default cache dir of huggingface')
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
'--output-json',
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help='Path to save the throughput results in JSON format.')
|
||||||
parser.add_argument(
|
|
||||||
'--distributed-executor-backend',
|
|
||||||
choices=['ray', 'mp'],
|
|
||||||
default=None,
|
|
||||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
|
||||||
'is used, will be automatically set to "ray" if installed '
|
|
||||||
'or "mp" (multiprocessing) otherwise.')
|
|
||||||
parser.add_argument(
|
|
||||||
'--load-format',
|
|
||||||
type=str,
|
|
||||||
default=EngineArgs.load_format,
|
|
||||||
choices=[
|
|
||||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
|
||||||
'bitsandbytes'
|
|
||||||
],
|
|
||||||
help='The format of the model weights to load.\n\n'
|
|
||||||
'* "auto" will try to load the weights in the safetensors format '
|
|
||||||
'and fall back to the pytorch bin format if safetensors format '
|
|
||||||
'is not available.\n'
|
|
||||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
|
||||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
|
||||||
'* "npcache" will load the weights in pytorch format and store '
|
|
||||||
'a numpy cache to speed up the loading.\n'
|
|
||||||
'* "dummy" will initialize the weights with random values, '
|
|
||||||
'which is mainly for profiling.\n'
|
|
||||||
'* "tensorizer" will load the weights using tensorizer from '
|
|
||||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
|
||||||
'section for more information.\n'
|
|
||||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
|
||||||
'quantization.\n')
|
|
||||||
parser.add_argument(
|
|
||||||
"--disable-async-output-proc",
|
|
||||||
action='store_true',
|
|
||||||
default=False,
|
|
||||||
help="Disable async output processor for vLLM backend.")
|
|
||||||
parser.add_argument("--async-engine",
|
parser.add_argument("--async-engine",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
@@ -531,6 +409,7 @@ if __name__ == "__main__":
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
help="Disable decoupled async engine frontend.")
|
help="Disable decoupled async engine frontend.")
|
||||||
|
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import time
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
from vllm.platforms import current_platform
|
||||||
seed_everything)
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@@ -16,7 +16,7 @@ def main(num_tokens: int,
|
|||||||
do_profile: bool = False,
|
do_profile: bool = False,
|
||||||
num_warmup_iters: int = 5,
|
num_warmup_iters: int = 5,
|
||||||
num_iters: int = 100) -> None:
|
num_iters: int = 100) -> None:
|
||||||
seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||||
|
|||||||
@@ -269,10 +269,10 @@ def run_square_bench(args):
|
|||||||
|
|
||||||
|
|
||||||
def run_range_bench(args):
|
def run_range_bench(args):
|
||||||
m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
|
m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
|
||||||
m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
|
m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
|
||||||
m_increment, k_increment, n_increment = \
|
m_increment, k_increment, n_increment = \
|
||||||
[int(x) for x in args.dim_increment.split(",")]
|
(int(x) for x in args.dim_increment.split(","))
|
||||||
Ms = list(range(m_start, m_end + 1, m_increment))
|
Ms = list(range(m_start, m_end + 1, m_increment))
|
||||||
Ks = list(range(k_start, k_end + 1, k_increment))
|
Ks = list(range(k_start, k_end + 1, k_increment))
|
||||||
Ns = list(range(n_start, n_end + 1, n_increment))
|
Ns = list(range(n_start, n_end + 1, n_increment))
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ from ray.experimental.tqdm_ray import tqdm
|
|||||||
from transformers import AutoConfig
|
from transformers import AutoConfig
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
from vllm.utils import FlexibleArgumentParser, seed_everything
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
class BenchmarkConfig(TypedDict):
|
class BenchmarkConfig(TypedDict):
|
||||||
@@ -88,22 +89,23 @@ def benchmark_config(
|
|||||||
input_gating.copy_(gating_output[i])
|
input_gating.copy_(gating_output[i])
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
fused_moe(
|
from vllm.model_executor.layers.fused_moe import override_config
|
||||||
x,
|
with override_config(config):
|
||||||
w1,
|
fused_moe(
|
||||||
w2,
|
x,
|
||||||
input_gating,
|
w1,
|
||||||
topk,
|
w2,
|
||||||
renormalize=True,
|
input_gating,
|
||||||
inplace=True,
|
topk,
|
||||||
override_config=config,
|
renormalize=True,
|
||||||
use_fp8_w8a8=use_fp8_w8a8,
|
inplace=True,
|
||||||
use_int8_w8a16=use_int8_w8a16,
|
use_fp8_w8a8=use_fp8_w8a8,
|
||||||
w1_scale=w1_scale,
|
use_int8_w8a16=use_int8_w8a16,
|
||||||
w2_scale=w2_scale,
|
w1_scale=w1_scale,
|
||||||
a1_scale=a1_scale,
|
w2_scale=w2_scale,
|
||||||
a2_scale=a2_scale,
|
a1_scale=a1_scale,
|
||||||
)
|
a2_scale=a2_scale,
|
||||||
|
)
|
||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
@@ -166,7 +168,7 @@ class BenchmarkWorker:
|
|||||||
|
|
||||||
def __init__(self, seed: int) -> None:
|
def __init__(self, seed: int) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
|
|
||||||
def benchmark(
|
def benchmark(
|
||||||
@@ -180,7 +182,7 @@ class BenchmarkWorker:
|
|||||||
use_fp8_w8a8: bool,
|
use_fp8_w8a8: bool,
|
||||||
use_int8_w8a16: bool,
|
use_int8_w8a16: bool,
|
||||||
) -> Tuple[Dict[str, int], float]:
|
) -> Tuple[Dict[str, int], float]:
|
||||||
seed_everything(self.seed)
|
current_platform.seed_everything(self.seed)
|
||||||
dtype_str = get_config_dtype_str(dtype,
|
dtype_str = get_config_dtype_str(dtype,
|
||||||
use_int8_w8a16=use_int8_w8a16,
|
use_int8_w8a16=use_int8_w8a16,
|
||||||
use_fp8_w8a8=use_fp8_w8a8)
|
use_fp8_w8a8=use_fp8_w8a8)
|
||||||
|
|||||||
@@ -5,8 +5,9 @@ from typing import List, Optional
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||||
create_kv_caches_with_random, seed_everything)
|
create_kv_caches_with_random)
|
||||||
|
|
||||||
NUM_BLOCKS = 1024
|
NUM_BLOCKS = 1024
|
||||||
PARTITION_SIZE = 512
|
PARTITION_SIZE = 512
|
||||||
@@ -28,7 +29,7 @@ def main(
|
|||||||
device: str = "cuda",
|
device: str = "cuda",
|
||||||
kv_cache_dtype: Optional[str] = None,
|
kv_cache_dtype: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
|
|
||||||
scale = float(1.0 / (head_size**0.5))
|
scale = float(1.0 / (head_size**0.5))
|
||||||
query = torch.empty(num_seqs,
|
query = torch.empty(num_seqs,
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import time
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
from vllm.platforms import current_platform
|
||||||
seed_everything)
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@@ -17,7 +17,7 @@ def main(num_tokens: int,
|
|||||||
do_profile: bool = False,
|
do_profile: bool = False,
|
||||||
num_warmup_iters: int = 5,
|
num_warmup_iters: int = 5,
|
||||||
num_iters: int = 100) -> None:
|
num_iters: int = 100) -> None:
|
||||||
seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||||
|
|||||||
@@ -6,7 +6,8 @@ import torch
|
|||||||
|
|
||||||
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
|
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
|
||||||
get_rope)
|
get_rope)
|
||||||
from vllm.utils import FlexibleArgumentParser, seed_everything
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def benchmark_rope_kernels_multi_lora(
|
def benchmark_rope_kernels_multi_lora(
|
||||||
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
max_position: int = 8192,
|
max_position: int = 8192,
|
||||||
base: int = 10000,
|
base: int = 10000,
|
||||||
) -> None:
|
) -> None:
|
||||||
seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
if rotary_dim is None:
|
if rotary_dim is None:
|
||||||
rotary_dim = head_size
|
rotary_dim = head_size
|
||||||
|
|||||||
@@ -4,13 +4,13 @@ PORT=8000
|
|||||||
MODEL=$1
|
MODEL=$1
|
||||||
TOKENS=$2
|
TOKENS=$2
|
||||||
|
|
||||||
docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
|
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
|
||||||
-v $PWD/data:/data \
|
-v "$PWD/data:/data" \
|
||||||
ghcr.io/huggingface/text-generation-inference:2.2.0 \
|
ghcr.io/huggingface/text-generation-inference:2.2.0 \
|
||||||
--model-id $MODEL \
|
--model-id "$MODEL" \
|
||||||
--sharded false \
|
--sharded false \
|
||||||
--max-input-length 1024 \
|
--max-input-length 1024 \
|
||||||
--max-total-tokens 2048 \
|
--max-total-tokens 2048 \
|
||||||
--max-best-of 5 \
|
--max-best-of 5 \
|
||||||
--max-concurrent-requests 5000 \
|
--max-concurrent-requests 5000 \
|
||||||
--max-batch-total-tokens $TOKENS
|
--max-batch-total-tokens "$TOKENS"
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
|
|||||||
#
|
#
|
||||||
list(APPEND CXX_COMPILE_FLAGS
|
list(APPEND CXX_COMPILE_FLAGS
|
||||||
"-fopenmp"
|
"-fopenmp"
|
||||||
|
"-mf16c"
|
||||||
"-DVLLM_CPU_EXTENSION")
|
"-DVLLM_CPU_EXTENSION")
|
||||||
|
|
||||||
execute_process(COMMAND cat /proc/cpuinfo
|
execute_process(COMMAND cat /proc/cpuinfo
|
||||||
@@ -92,7 +93,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
oneDNN
|
oneDNN
|
||||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
GIT_TAG v3.5.3
|
GIT_TAG v3.6
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
GIT_SHALLOW TRUE
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
|||||||
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
|
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
|
||||||
# dependencies that are not necessary and may not be installed.
|
# dependencies that are not necessary and may not be installed.
|
||||||
if (GPU_LANGUAGE STREQUAL "CUDA")
|
if (GPU_LANGUAGE STREQUAL "CUDA")
|
||||||
if ("${CUDA_CUDA_LIB}" STREQUAL "")
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
|
||||||
set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
|
|
||||||
endif()
|
|
||||||
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
|
|
||||||
${CUDA_LIBRARIES})
|
|
||||||
else()
|
else()
|
||||||
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -1,17 +1,19 @@
|
|||||||
# ruff: noqa
|
# ruff: noqa
|
||||||
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
|
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
|
||||||
|
|
||||||
# Unlike the rest of the PyTorch this file must be python2 compliant.
|
|
||||||
# This script outputs relevant system environment info
|
|
||||||
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
|
|
||||||
import datetime
|
import datetime
|
||||||
import locale
|
import locale
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
# Unlike the rest of the PyTorch this file must be python2 compliant.
|
||||||
|
# This script outputs relevant system environment info
|
||||||
|
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from vllm.envs import environment_variables
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
TORCH_AVAILABLE = True
|
TORCH_AVAILABLE = True
|
||||||
@@ -52,6 +54,7 @@ SystemEnv = namedtuple(
|
|||||||
'vllm_version', # vllm specific field
|
'vllm_version', # vllm specific field
|
||||||
'vllm_build_flags', # vllm specific field
|
'vllm_build_flags', # vllm specific field
|
||||||
'gpu_topo', # vllm specific field
|
'gpu_topo', # vllm specific field
|
||||||
|
'env_vars',
|
||||||
])
|
])
|
||||||
|
|
||||||
DEFAULT_CONDA_PATTERNS = {
|
DEFAULT_CONDA_PATTERNS = {
|
||||||
@@ -512,6 +515,22 @@ def is_xnnpack_available():
|
|||||||
else:
|
else:
|
||||||
return "N/A"
|
return "N/A"
|
||||||
|
|
||||||
|
def get_env_vars():
|
||||||
|
env_vars = ''
|
||||||
|
secret_terms=('secret', 'token', 'api', 'access', 'password')
|
||||||
|
report_prefix = ("TORCH", "NCCL", "PYTORCH",
|
||||||
|
"CUDA", "CUBLAS", "CUDNN",
|
||||||
|
"OMP_", "MKL_",
|
||||||
|
"NVIDIA")
|
||||||
|
for k, v in os.environ.items():
|
||||||
|
if any(term in k.lower() for term in secret_terms):
|
||||||
|
continue
|
||||||
|
if k in environment_variables:
|
||||||
|
env_vars = env_vars + "{}={}".format(k, v) + "\n"
|
||||||
|
if k.startswith(report_prefix):
|
||||||
|
env_vars = env_vars + "{}={}".format(k, v) + "\n"
|
||||||
|
|
||||||
|
return env_vars
|
||||||
|
|
||||||
def get_env_info():
|
def get_env_info():
|
||||||
run_lambda = run
|
run_lambda = run
|
||||||
@@ -583,6 +602,7 @@ def get_env_info():
|
|||||||
vllm_version=vllm_version,
|
vllm_version=vllm_version,
|
||||||
vllm_build_flags=vllm_build_flags,
|
vllm_build_flags=vllm_build_flags,
|
||||||
gpu_topo=gpu_topo,
|
gpu_topo=gpu_topo,
|
||||||
|
env_vars=get_env_vars(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -631,6 +651,8 @@ vLLM Build Flags:
|
|||||||
{vllm_build_flags}
|
{vllm_build_flags}
|
||||||
GPU Topology:
|
GPU Topology:
|
||||||
{gpu_topo}
|
{gpu_topo}
|
||||||
|
|
||||||
|
{env_vars}
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -89,6 +89,48 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
|
||||||
|
const float f = (float)x;
|
||||||
|
return (T)(f > threshold ? f : 0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
|
||||||
|
__global__ void act_and_mul_kernel_with_param(
|
||||||
|
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
|
||||||
|
const float param) {
|
||||||
|
const int64_t token_idx = blockIdx.x;
|
||||||
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
|
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||||
|
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||||
|
out[token_idx * d + idx] = ACT_FN(x, param) * y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
|
||||||
|
int d = input.size(-1) / 2; \
|
||||||
|
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||||
|
dim3 grid(num_tokens); \
|
||||||
|
dim3 block(std::min(d, 1024)); \
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||||
|
input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
|
||||||
|
vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
|
||||||
|
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||||
|
input.data_ptr<scalar_t>(), d, \
|
||||||
|
PARAM); \
|
||||||
|
});
|
||||||
|
|
||||||
|
void fatrelu_and_mul(torch::Tensor& out, // [..., d],
|
||||||
|
torch::Tensor& input, // [..., 2 * d]
|
||||||
|
double threshold) {
|
||||||
|
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
|
||||||
|
}
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
// Element-wise activation kernel template.
|
// Element-wise activation kernel template.
|
||||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||||
__global__ void activation_kernel(
|
__global__ void activation_kernel(
|
||||||
|
|||||||
@@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel(
|
|||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
|
|
||||||
VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
|
|
||||||
((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, \
|
|
||||||
BLOCK_SIZE, NUM_THREADS, \
|
|
||||||
KV_DTYPE, IS_BLOCK_SPARSE>), \
|
|
||||||
shared_mem_size); \
|
|
||||||
vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
|
|
||||||
NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \
|
|
||||||
<<<grid, block, shared_mem_size, stream>>>( \
|
|
||||||
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
|
|
||||||
scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
|
|
||||||
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
|
|
||||||
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
|
||||||
blocksparse_vert_stride, blocksparse_block_size, \
|
|
||||||
blocksparse_head_sliding_step);
|
|
||||||
|
|
||||||
// TODO(woosuk): Tune NUM_THREADS.
|
|
||||||
template <typename T, typename CACHE_T, int BLOCK_SIZE,
|
|
||||||
vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
|
|
||||||
int NUM_THREADS = 128>
|
|
||||||
void paged_attention_v1_launcher(
|
|
||||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
|
||||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
|
||||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
|
||||||
const int blocksparse_head_sliding_step) {
|
|
||||||
int num_seqs = query.size(0);
|
|
||||||
int num_heads = query.size(1);
|
|
||||||
int head_size = query.size(2);
|
|
||||||
int max_num_blocks_per_seq = block_tables.size(1);
|
|
||||||
int q_stride = query.stride(0);
|
|
||||||
int kv_block_stride = key_cache.stride(0);
|
|
||||||
int kv_head_stride = key_cache.stride(1);
|
|
||||||
|
|
||||||
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
|
||||||
assert(head_size % thread_group_size == 0);
|
|
||||||
|
|
||||||
// NOTE: alibi_slopes is optional.
|
|
||||||
const float* alibi_slopes_ptr =
|
|
||||||
alibi_slopes
|
|
||||||
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
|
|
||||||
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
|
||||||
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
|
||||||
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
|
|
||||||
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
|
|
||||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
|
||||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
|
||||||
|
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
|
||||||
int padded_max_seq_len =
|
|
||||||
DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
|
|
||||||
int logits_size = padded_max_seq_len * sizeof(float);
|
|
||||||
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
|
||||||
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
|
|
||||||
// Keep that in sync with the logic here!
|
|
||||||
int shared_mem_size = std::max(logits_size, outputs_size);
|
|
||||||
|
|
||||||
dim3 grid(num_heads, num_seqs, 1);
|
|
||||||
dim3 block(NUM_THREADS);
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
switch (head_size) {
|
|
||||||
// NOTE(woosuk): To reduce the compilation time, we only compile for the
|
|
||||||
// head sizes that we use in the model. However, we can easily extend this
|
|
||||||
// to support any head size which is a multiple of 16.
|
|
||||||
case 64:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(64);
|
|
||||||
break;
|
|
||||||
case 80:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(80);
|
|
||||||
break;
|
|
||||||
case 96:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(96);
|
|
||||||
break;
|
|
||||||
case 112:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(112);
|
|
||||||
break;
|
|
||||||
case 120:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(120);
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(128);
|
|
||||||
break;
|
|
||||||
case 192:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(192);
|
|
||||||
break;
|
|
||||||
case 256:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V1(256);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
|
|
||||||
paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
|
|
||||||
IS_BLOCK_SPARSE>( \
|
|
||||||
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
|
||||||
seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
|
|
||||||
blocksparse_local_blocks, blocksparse_vert_stride, \
|
|
||||||
blocksparse_block_size, blocksparse_head_sliding_step);
|
|
||||||
|
|
||||||
#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
|
|
||||||
switch (is_block_sparse) { \
|
|
||||||
case true: \
|
|
||||||
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
|
|
||||||
break; \
|
|
||||||
case false: \
|
|
||||||
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
|
|
||||||
break; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
|
|
||||||
// 1, 2, 4, 64, 128, 256.
|
|
||||||
#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
|
|
||||||
switch (block_size) { \
|
|
||||||
case 8: \
|
|
||||||
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
case 16: \
|
|
||||||
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
case 32: \
|
|
||||||
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
default: \
|
|
||||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
|
||||||
break; \
|
|
||||||
}
|
|
||||||
|
|
||||||
void paged_attention_v1(
|
|
||||||
torch::Tensor& out, // [num_seqs, num_heads, head_size]
|
|
||||||
torch::Tensor& query, // [num_seqs, num_heads, head_size]
|
|
||||||
torch::Tensor&
|
|
||||||
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
|
||||||
torch::Tensor&
|
|
||||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
|
||||||
int64_t num_kv_heads, // [num_heads]
|
|
||||||
double scale,
|
|
||||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
|
||||||
torch::Tensor& seq_lens, // [num_seqs]
|
|
||||||
int64_t block_size, int64_t max_seq_len,
|
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
|
||||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
|
||||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
|
||||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
|
||||||
const int64_t blocksparse_head_sliding_step) {
|
|
||||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
|
||||||
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
|
||||||
CALL_V1_LAUNCHER_BLOCK_SIZE)
|
|
||||||
}
|
|
||||||
|
|
||||||
#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
|
|
||||||
vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
|
|
||||||
NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, \
|
|
||||||
PARTITION_SIZE> \
|
|
||||||
<<<grid, block, shared_mem_size, stream>>>( \
|
|
||||||
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
|
|
||||||
value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
|
||||||
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
|
||||||
kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
|
|
||||||
blocksparse_local_blocks, blocksparse_vert_stride, \
|
|
||||||
blocksparse_block_size, blocksparse_head_sliding_step); \
|
|
||||||
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
|
|
||||||
PARTITION_SIZE> \
|
|
||||||
<<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
|
|
||||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
|
|
||||||
max_num_partitions);
|
|
||||||
|
|
||||||
template <typename T, typename CACHE_T, int BLOCK_SIZE,
|
|
||||||
vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
|
|
||||||
int NUM_THREADS = 128, int PARTITION_SIZE = 512>
|
|
||||||
void paged_attention_v2_launcher(
|
|
||||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
|
||||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
|
||||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
|
||||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
|
||||||
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
|
||||||
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
|
||||||
const int blocksparse_head_sliding_step) {
|
|
||||||
int num_seqs = query.size(0);
|
|
||||||
int num_heads = query.size(1);
|
|
||||||
int head_size = query.size(2);
|
|
||||||
int max_num_blocks_per_seq = block_tables.size(1);
|
|
||||||
int q_stride = query.stride(0);
|
|
||||||
int kv_block_stride = key_cache.stride(0);
|
|
||||||
int kv_head_stride = key_cache.stride(1);
|
|
||||||
|
|
||||||
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
|
||||||
assert(head_size % thread_group_size == 0);
|
|
||||||
|
|
||||||
// NOTE: alibi_slopes is optional.
|
|
||||||
const float* alibi_slopes_ptr =
|
|
||||||
alibi_slopes
|
|
||||||
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
|
|
||||||
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
|
||||||
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
|
|
||||||
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
|
|
||||||
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
|
|
||||||
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
|
||||||
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
|
|
||||||
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
|
|
||||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
|
||||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
|
||||||
|
|
||||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
|
||||||
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
|
||||||
int logits_size = PARTITION_SIZE * sizeof(float);
|
|
||||||
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
|
||||||
|
|
||||||
// For paged attention v2 kernel.
|
|
||||||
dim3 grid(num_heads, num_seqs, max_num_partitions);
|
|
||||||
int shared_mem_size = std::max(logits_size, outputs_size);
|
|
||||||
// For paged attention v2 reduce kernel.
|
|
||||||
dim3 reduce_grid(num_heads, num_seqs);
|
|
||||||
int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
|
|
||||||
|
|
||||||
dim3 block(NUM_THREADS);
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
switch (head_size) {
|
|
||||||
// NOTE(woosuk): To reduce the compilation time, we only compile for the
|
|
||||||
// head sizes that we use in the model. However, we can easily extend this
|
|
||||||
// to support any head size which is a multiple of 16.
|
|
||||||
case 64:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(64);
|
|
||||||
break;
|
|
||||||
case 80:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(80);
|
|
||||||
break;
|
|
||||||
case 96:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(96);
|
|
||||||
break;
|
|
||||||
case 112:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(112);
|
|
||||||
break;
|
|
||||||
case 120:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(120);
|
|
||||||
break;
|
|
||||||
case 128:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(128);
|
|
||||||
break;
|
|
||||||
case 192:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(192);
|
|
||||||
break;
|
|
||||||
case 256:
|
|
||||||
LAUNCH_PAGED_ATTENTION_V2(256);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
|
|
||||||
paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
|
|
||||||
IS_BLOCK_SPARSE>( \
|
|
||||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
|
||||||
num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
|
|
||||||
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
|
||||||
blocksparse_vert_stride, blocksparse_block_size, \
|
|
||||||
blocksparse_head_sliding_step);
|
|
||||||
|
|
||||||
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
|
|
||||||
switch (is_block_sparse) { \
|
|
||||||
case true: \
|
|
||||||
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
|
|
||||||
break; \
|
|
||||||
case false: \
|
|
||||||
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
|
|
||||||
break; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
|
|
||||||
// 1, 2, 4, 64, 128, 256.
|
|
||||||
#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
|
|
||||||
switch (block_size) { \
|
|
||||||
case 8: \
|
|
||||||
CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
case 16: \
|
|
||||||
CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
case 32: \
|
|
||||||
CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
|
|
||||||
break; \
|
|
||||||
default: \
|
|
||||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
|
||||||
break; \
|
|
||||||
}
|
|
||||||
|
|
||||||
void paged_attention_v2(
|
|
||||||
torch::Tensor& out, // [num_seqs, num_heads, head_size]
|
|
||||||
torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
|
|
||||||
torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
|
|
||||||
torch::Tensor&
|
|
||||||
tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
|
||||||
torch::Tensor& query, // [num_seqs, num_heads, head_size]
|
|
||||||
torch::Tensor&
|
|
||||||
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
|
||||||
torch::Tensor&
|
|
||||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
|
||||||
int64_t num_kv_heads, // [num_heads]
|
|
||||||
double scale,
|
|
||||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
|
||||||
torch::Tensor& seq_lens, // [num_seqs]
|
|
||||||
int64_t block_size, int64_t max_seq_len,
|
|
||||||
const c10::optional<torch::Tensor>& alibi_slopes,
|
|
||||||
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
|
||||||
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
|
||||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
|
||||||
const int64_t blocksparse_head_sliding_step) {
|
|
||||||
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
|
||||||
CALL_V2_LAUNCHER_BLOCK_SIZE)
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef WARP_SIZE
|
#undef WARP_SIZE
|
||||||
#undef MAX
|
#undef MAX
|
||||||
#undef MIN
|
#undef MIN
|
||||||
196
csrc/attention/paged_attention_v1.cu
Normal file
196
csrc/attention/paged_attention_v1.cu
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
/*
|
||||||
|
* Adapted from
|
||||||
|
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||||
|
* Copyright (c) 2023, The vLLM team.
|
||||||
|
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "attention_kernels.cuh"
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#define WARP_SIZE 32
|
||||||
|
#else
|
||||||
|
#define WARP_SIZE warpSize
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
|
||||||
|
|
||||||
|
#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
|
||||||
|
VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
|
||||||
|
((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, \
|
||||||
|
BLOCK_SIZE, NUM_THREADS, \
|
||||||
|
KV_DTYPE, IS_BLOCK_SPARSE>), \
|
||||||
|
shared_mem_size); \
|
||||||
|
vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
|
||||||
|
NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE> \
|
||||||
|
<<<grid, block, shared_mem_size, stream>>>( \
|
||||||
|
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
|
||||||
|
scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
|
||||||
|
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
|
||||||
|
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
||||||
|
blocksparse_vert_stride, blocksparse_block_size, \
|
||||||
|
blocksparse_head_sliding_step);
|
||||||
|
|
||||||
|
// TODO(woosuk): Tune NUM_THREADS.
|
||||||
|
template <typename T, typename CACHE_T, int BLOCK_SIZE,
|
||||||
|
vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
|
||||||
|
int NUM_THREADS = 128>
|
||||||
|
void paged_attention_v1_launcher(
|
||||||
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||||
|
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||||
|
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||||
|
const int blocksparse_head_sliding_step) {
|
||||||
|
int num_seqs = query.size(0);
|
||||||
|
int num_heads = query.size(1);
|
||||||
|
int head_size = query.size(2);
|
||||||
|
int max_num_blocks_per_seq = block_tables.size(1);
|
||||||
|
int q_stride = query.stride(0);
|
||||||
|
int kv_block_stride = key_cache.stride(0);
|
||||||
|
int kv_head_stride = key_cache.stride(1);
|
||||||
|
|
||||||
|
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||||
|
assert(head_size % thread_group_size == 0);
|
||||||
|
|
||||||
|
// NOTE: alibi_slopes is optional.
|
||||||
|
const float* alibi_slopes_ptr =
|
||||||
|
alibi_slopes
|
||||||
|
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
|
||||||
|
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||||
|
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||||
|
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
|
||||||
|
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
|
||||||
|
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||||
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
|
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
|
int padded_max_seq_len =
|
||||||
|
DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
|
||||||
|
int logits_size = padded_max_seq_len * sizeof(float);
|
||||||
|
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
||||||
|
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
|
||||||
|
// Keep that in sync with the logic here!
|
||||||
|
int shared_mem_size = std::max(logits_size, outputs_size);
|
||||||
|
|
||||||
|
dim3 grid(num_heads, num_seqs, 1);
|
||||||
|
dim3 block(NUM_THREADS);
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
switch (head_size) {
|
||||||
|
// NOTE(woosuk): To reduce the compilation time, we only compile for the
|
||||||
|
// head sizes that we use in the model. However, we can easily extend this
|
||||||
|
// to support any head size which is a multiple of 16.
|
||||||
|
case 32:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(32);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(64);
|
||||||
|
break;
|
||||||
|
case 80:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(80);
|
||||||
|
break;
|
||||||
|
case 96:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(96);
|
||||||
|
break;
|
||||||
|
case 112:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(112);
|
||||||
|
break;
|
||||||
|
case 120:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(120);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(128);
|
||||||
|
break;
|
||||||
|
case 192:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(192);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V1(256);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
|
||||||
|
paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
|
||||||
|
IS_BLOCK_SPARSE>( \
|
||||||
|
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
||||||
|
seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
|
||||||
|
blocksparse_local_blocks, blocksparse_vert_stride, \
|
||||||
|
blocksparse_block_size, blocksparse_head_sliding_step);
|
||||||
|
|
||||||
|
#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
|
||||||
|
switch (is_block_sparse) { \
|
||||||
|
case true: \
|
||||||
|
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
|
||||||
|
break; \
|
||||||
|
case false: \
|
||||||
|
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
|
||||||
|
break; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
|
||||||
|
// 1, 2, 4, 64, 128, 256.
|
||||||
|
#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
|
||||||
|
switch (block_size) { \
|
||||||
|
case 8: \
|
||||||
|
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
|
||||||
|
break; \
|
||||||
|
case 16: \
|
||||||
|
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
|
||||||
|
break; \
|
||||||
|
case 32: \
|
||||||
|
CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||||
|
break; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// Public entry point for the v1 paged-attention op.
// The function itself only derives `is_block_sparse` and then hands control to
// DISPATCH_BY_KV_CACHE_DTYPE with CALL_V1_LAUNCHER_BLOCK_SIZE as the callback.
// The launcher macros read this function's parameters and `is_block_sparse`
// BY NAME, so none of them may be renamed without updating the macros.
// NOTE(review): DISPATCH_BY_KV_CACHE_DTYPE is defined outside this file
// section; presumably it selects T / CACHE_T / KV_DTYPE from query.dtype()
// and kv_cache_dtype - confirm against its definition.
void paged_attention_v1(
|
||||||
|
torch::Tensor& out, // [num_seqs, num_heads, head_size]
|
||||||
|
torch::Tensor& query, // [num_seqs, num_heads, head_size]
|
||||||
|
torch::Tensor&
|
||||||
|
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||||
|
torch::Tensor&
|
||||||
|
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||||
|
int64_t num_kv_heads, // scalar count of KV heads (not a tensor shape)
|
||||||
|
double scale,
|
||||||
|
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||||
|
torch::Tensor& seq_lens, // [num_seqs]
|
||||||
|
int64_t block_size, int64_t max_seq_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
|
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||||
|
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||||
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
|
// A vertical stride greater than 1 is what switches on the block-sparse path.
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||||
|
|
||||||
|
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
||||||
|
CALL_V1_LAUNCHER_BLOCK_SIZE)
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef WARP_SIZE
|
||||||
|
#undef MAX
|
||||||
|
#undef MIN
|
||||||
|
#undef DIVIDE_ROUND_UP
|
||||||
206
csrc/attention/paged_attention_v2.cu
Normal file
206
csrc/attention/paged_attention_v2.cu
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
/*
|
||||||
|
* Adapted from
|
||||||
|
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||||
|
* Copyright (c) 2023, The vLLM team.
|
||||||
|
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "attention_kernels.cuh"
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#define WARP_SIZE 32
|
||||||
|
#else
|
||||||
|
#define WARP_SIZE warpSize
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
|
||||||
|
|
||||||
|
// Launches the two v2 kernels for one compile-time HEAD_SIZE, back to back on
// the same `stream`:
//   1. vllm::paged_attention_v2_kernel on `grid`, which is handed
//      exp_sums_ptr / max_logits_ptr / tmp_out_ptr plus the query and KV-cache
//      pointers;
//   2. vllm::paged_attention_v2_reduce_kernel on `reduce_grid`, which is handed
//      out_ptr together with those same three buffers.
// Only HEAD_SIZE is a macro parameter. T, CACHE_T, BLOCK_SIZE, NUM_THREADS,
// KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE and every launch argument are
// resolved by name at the expansion site.
// The expansion is TWO statements, so it must not be used as the body of an
// unbraced if/else.
#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
|
||||||
|
vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, \
|
||||||
|
NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE, \
|
||||||
|
PARTITION_SIZE> \
|
||||||
|
<<<grid, block, shared_mem_size, stream>>>( \
|
||||||
|
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
|
||||||
|
value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||||
|
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||||
|
kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \
|
||||||
|
blocksparse_local_blocks, blocksparse_vert_stride, \
|
||||||
|
blocksparse_block_size, blocksparse_head_sliding_step); \
|
||||||
|
vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, \
|
||||||
|
PARTITION_SIZE> \
|
||||||
|
<<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
|
||||||
|
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
|
||||||
|
max_num_partitions);
|
||||||
|
|
||||||
|
template <typename T, typename CACHE_T, int BLOCK_SIZE,
|
||||||
|
vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
|
||||||
|
int NUM_THREADS = 128, int PARTITION_SIZE = 512>
|
||||||
|
void paged_attention_v2_launcher(
|
||||||
|
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||||
|
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
|
||||||
|
float v_scale, const int tp_rank, const int blocksparse_local_blocks,
|
||||||
|
const int blocksparse_vert_stride, const int blocksparse_block_size,
|
||||||
|
const int blocksparse_head_sliding_step) {
|
||||||
|
int num_seqs = query.size(0);
|
||||||
|
int num_heads = query.size(1);
|
||||||
|
int head_size = query.size(2);
|
||||||
|
int max_num_blocks_per_seq = block_tables.size(1);
|
||||||
|
int q_stride = query.stride(0);
|
||||||
|
int kv_block_stride = key_cache.stride(0);
|
||||||
|
int kv_head_stride = key_cache.stride(1);
|
||||||
|
|
||||||
|
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||||
|
assert(head_size % thread_group_size == 0);
|
||||||
|
|
||||||
|
// NOTE: alibi_slopes is optional.
|
||||||
|
const float* alibi_slopes_ptr =
|
||||||
|
alibi_slopes
|
||||||
|
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
|
||||||
|
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||||
|
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
|
||||||
|
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
|
||||||
|
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
|
||||||
|
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||||
|
CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
|
||||||
|
CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
|
||||||
|
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||||
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
|
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||||
|
int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
||||||
|
int logits_size = PARTITION_SIZE * sizeof(float);
|
||||||
|
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
|
||||||
|
|
||||||
|
// For paged attention v2 kernel.
|
||||||
|
dim3 grid(num_heads, num_seqs, max_num_partitions);
|
||||||
|
int shared_mem_size = std::max(logits_size, outputs_size);
|
||||||
|
// For paged attention v2 reduce kernel.
|
||||||
|
dim3 reduce_grid(num_heads, num_seqs);
|
||||||
|
int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
|
||||||
|
|
||||||
|
dim3 block(NUM_THREADS);
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
switch (head_size) {
|
||||||
|
// NOTE(woosuk): To reduce the compilation time, we only compile for the
|
||||||
|
// head sizes that we use in the model. However, we can easily extend this
|
||||||
|
// to support any head size which is a multiple of 16.
|
||||||
|
case 32:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(32);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(64);
|
||||||
|
break;
|
||||||
|
case 80:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(80);
|
||||||
|
break;
|
||||||
|
case 96:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(96);
|
||||||
|
break;
|
||||||
|
case 112:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(112);
|
||||||
|
break;
|
||||||
|
case 120:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(120);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(128);
|
||||||
|
break;
|
||||||
|
case 192:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(192);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_PAGED_ATTENTION_V2(256);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instantiates paged_attention_v2_launcher with the given template arguments
// and calls it. Every call argument (out, exp_sums, max_logits, tmp_out,
// query, ..., blocksparse_head_sliding_step) is NOT a macro parameter: it is
// picked up by name from the scope in which the macro is expanded.
// The expansion already ends in ';'.
#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
|
||||||
|
paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
|
||||||
|
IS_BLOCK_SPARSE>( \
|
||||||
|
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||||
|
num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
|
||||||
|
k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
|
||||||
|
blocksparse_vert_stride, blocksparse_block_size, \
|
||||||
|
blocksparse_head_sliding_step);
|
||||||
|
|
||||||
|
// Turns the runtime flag `is_block_sparse` (read by name from the expansion
// site) into the compile-time IS_BLOCK_SPARSE argument of CALL_V2_LAUNCHER.
// Written as if/else instead of `switch` on a bool: switching on a boolean
// draws -Wswitch-bool and needs `break` bookkeeping for no benefit.
// The fourth parameter is forwarded unchanged as the launcher's KV_DTYPE.
#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
  if (is_block_sparse) {                                                   \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);       \
  } else {                                                                 \
    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);      \
  }
|
||||||
|
|
||||||
|
// Maps the runtime `block_size` (read by name from the expansion site) onto a
// compile-time BLOCK_SIZE. Only 8, 16 and 32 are instantiated; block sizes
// 1, 2, 4, 64, 128 and 256 are deliberately left out to keep compile time
// down. Any other value is rejected through TORCH_CHECK.
#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
  if (block_size == 8) {                                          \
    CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);           \
  } else if (block_size == 16) {                                  \
    CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);          \
  } else if (block_size == 32) {                                  \
    CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);          \
  } else {                                                        \
    TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
  }
|
||||||
|
|
||||||
|
// Public entry point for the v2 (partitioned) paged-attention op.
// The function itself only derives `is_block_sparse` and then hands control to
// DISPATCH_BY_KV_CACHE_DTYPE with CALL_V2_LAUNCHER_BLOCK_SIZE as the callback.
// The launcher macros read this function's parameters and `is_block_sparse`
// BY NAME, so none of them may be renamed without updating the macros.
// NOTE(review): DISPATCH_BY_KV_CACHE_DTYPE is defined outside this file
// section; presumably it selects T / CACHE_T / KV_DTYPE from query.dtype()
// and kv_cache_dtype - confirm against its definition.
void paged_attention_v2(
|
||||||
|
torch::Tensor& out, // [num_seqs, num_heads, head_size]
|
||||||
|
torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
|
||||||
|
torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
|
||||||
|
torch::Tensor&
|
||||||
|
tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||||
|
torch::Tensor& query, // [num_seqs, num_heads, head_size]
|
||||||
|
torch::Tensor&
|
||||||
|
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||||
|
torch::Tensor&
|
||||||
|
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||||
|
int64_t num_kv_heads, // scalar count of KV heads (not a tensor shape)
|
||||||
|
double scale,
|
||||||
|
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||||
|
torch::Tensor& seq_lens, // [num_seqs]
|
||||||
|
int64_t block_size, int64_t max_seq_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
|
const std::string& kv_cache_dtype, double k_scale, double v_scale,
|
||||||
|
const int64_t tp_rank, const int64_t blocksparse_local_blocks,
|
||||||
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
|
// A vertical stride greater than 1 is what switches on the block-sparse path.
const bool is_block_sparse = (blocksparse_vert_stride > 1);
|
||||||
|
DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
|
||||||
|
CALL_V2_LAUNCHER_BLOCK_SIZE)
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef WARP_SIZE
|
||||||
|
#undef MAX
|
||||||
|
#undef MIN
|
||||||
|
#undef DIVIDE_ROUND_UP
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <torch/custom_class.h>
|
// For TORCH_CHECK
|
||||||
|
#include <torch/library.h>
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
@@ -9,12 +10,7 @@ namespace vllm {
|
|||||||
// in particular it can be used to represent sub-byte data types (something
|
// in particular it can be used to represent sub-byte data types (something
|
||||||
// that torch.dtype currently does not support).
|
// that torch.dtype currently does not support).
|
||||||
//
|
//
|
||||||
// ScalarTypeTorch is a subclass of ScalarType that is compatible with
|
// The type definitions on the Python side can be found in: vllm/scalar_type.py
|
||||||
// TORCH_LIBRARY, making it accessible from Python as well meaning this class
|
|
||||||
// can be used as a argument for custom operators, helping to simplify these
|
|
||||||
// interfaces.
|
|
||||||
//
|
|
||||||
// The type definitions on the Python side can be found in: vllm/_core_ext.pyi
|
|
||||||
// these type definitions should be kept up to date with any Python API changes
|
// these type definitions should be kept up to date with any Python API changes
|
||||||
// here.
|
// here.
|
||||||
//
|
//
|
||||||
@@ -308,204 +304,7 @@ class ScalarType {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from
|
using ScalarTypeId = ScalarType::Id;
|
||||||
// torch::CustomClassHolder), we use multiple inheritance here since we cannot
|
|
||||||
// have ScalarType inherit from torch::CustomClassHolder and have a constexpr
|
|
||||||
// constructor at the same time (torch::CustomClassHolder does not have a
|
|
||||||
// constexpr destructor)
|
|
||||||
// See also:
|
|
||||||
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
|
|
||||||
class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
|
|
||||||
public:
|
|
||||||
ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias,
|
|
||||||
bool _signed)
|
|
||||||
: ScalarType(exponent, mantissa, bias, _signed){};
|
|
||||||
|
|
||||||
ScalarTypeTorch(ScalarType type) : ScalarType(type){};
|
|
||||||
|
|
||||||
using Base = ScalarType;
|
|
||||||
using Self = ScalarTypeTorch;
|
|
||||||
using SelfPtr = c10::intrusive_ptr<Self>;
|
|
||||||
|
|
||||||
static void check_size_bits(int64_t size_bits, bool signed_) {
|
|
||||||
TORCH_CHECK(
|
|
||||||
size_bits <=
|
|
||||||
std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
|
|
||||||
"size_bits bit width is too large to be represented");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_bias(int64_t bias) {
|
|
||||||
using Bias = decltype(std::declval<Self>().bias);
|
|
||||||
TORCH_CHECK(bias <= std::numeric_limits<Bias>::max() &&
|
|
||||||
bias >= std::numeric_limits<Bias>::min(),
|
|
||||||
"bias too large or small to be represented");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_exponent(int64_t exponent) {
|
|
||||||
TORCH_CHECK(
|
|
||||||
exponent <=
|
|
||||||
std::numeric_limits<decltype(std::declval<Self>().exponent)>::max(),
|
|
||||||
"exponent bit width is too large to be represented");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void check_mantissa(int64_t mantissa) {
|
|
||||||
TORCH_CHECK(
|
|
||||||
mantissa <=
|
|
||||||
std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
|
|
||||||
"mantissa bit width is too large to be represented");
|
|
||||||
}
|
|
||||||
|
|
||||||
static SelfPtr int_(int64_t size_bits, c10::optional<int64_t> bias) {
|
|
||||||
check_size_bits(size_bits, true);
|
|
||||||
check_bias(bias.value_or(0));
|
|
||||||
return c10::make_intrusive<Self>(
|
|
||||||
ScalarType::int_(size_bits, bias.value_or(0)));
|
|
||||||
}
|
|
||||||
|
|
||||||
static SelfPtr uint(int64_t size_bits, c10::optional<int64_t> bias) {
|
|
||||||
check_size_bits(size_bits, true);
|
|
||||||
check_bias(bias.value_or(0));
|
|
||||||
return c10::make_intrusive<Self>(
|
|
||||||
ScalarType::uint(size_bits, bias.value_or(0)));
|
|
||||||
}
|
|
||||||
|
|
||||||
static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) {
|
|
||||||
check_mantissa(mantissa);
|
|
||||||
check_exponent(exponent);
|
|
||||||
return c10::make_intrusive<Self>(
|
|
||||||
ScalarType::float_IEEE754(exponent, mantissa));
|
|
||||||
}
|
|
||||||
|
|
||||||
static SelfPtr float_(int64_t exponent, int64_t mantissa,
|
|
||||||
bool finite_values_only, int64_t nan_repr) {
|
|
||||||
check_mantissa(mantissa);
|
|
||||||
check_exponent(exponent);
|
|
||||||
return c10::make_intrusive<Self>(ScalarType::float_(
|
|
||||||
exponent, mantissa, finite_values_only, NanRepr(nan_repr)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// This needs to be implemented and throw a TypeError in order for
|
|
||||||
// PyTorch's opcheck to work on ops that use ScalarTypes.
|
|
||||||
int64_t len() const {
|
|
||||||
throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
|
|
||||||
"__len__ not implemented");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Serialize a ScalarType into a tuple of pairs. Where each pair
|
|
||||||
// is a (fieldname, value).
|
|
||||||
// For simplicity, we are just going to convert to a ScalarTypeId.
|
|
||||||
std::tuple<std::tuple<std::string, int64_t>> obj_flatten() const {
|
|
||||||
return {{"ScalarType", id()}};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Deserialize a scalar type that has been serialized by obj_flatten,
|
|
||||||
// ostensibly from a tuple of (member name, value) pairs, but in reality
|
|
||||||
// just a ScalarTypeId.
|
|
||||||
static SelfPtr obj_unflatten(
|
|
||||||
std::tuple<std::tuple<std::string, int64_t>> const& flat_type) {
|
|
||||||
return c10::make_intrusive<Self>(
|
|
||||||
from_id(std::get<1>(std::get<0>(flat_type))));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static void bind_readonly_property(torch::class_<Self>& cls,
|
|
||||||
std::string const& name, T Base::*field) {
|
|
||||||
auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) {
|
|
||||||
if constexpr (std::is_member_function_pointer_v<decltype(field)>) {
|
|
||||||
return (self.get()->*field)();
|
|
||||||
} else {
|
|
||||||
return self.get()->*field;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
auto getter_func = [field = std::move(field),
|
|
||||||
getter_func_helper = std::move(getter_func_helper)](
|
|
||||||
SelfPtr const& self) {
|
|
||||||
auto val = getter_func_helper(self);
|
|
||||||
// upconvert uint8_t, int32_t etc. to int64_t for python
|
|
||||||
if constexpr (std::is_integral_v<T>) {
|
|
||||||
return static_cast<int64_t>(val);
|
|
||||||
} else {
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
cls.def_property(name, getter_func);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename MemberFunc, typename Cls>
|
|
||||||
static void bind_function(torch::class_<Self>& cls, const std::string& name,
|
|
||||||
MemberFunc Cls::*member) {
|
|
||||||
cls.def(name, [member = std::move(member)](SelfPtr const& self) {
|
|
||||||
return (self.get()->*member)();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Func>
|
|
||||||
static void bind_function(torch::class_<Self>& cls, const std::string& name,
|
|
||||||
Func func) {
|
|
||||||
cls.def(name, func);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Func>
|
|
||||||
static void bind_static_function(torch::class_<Self>& cls,
|
|
||||||
const std::string& name, Func func) {
|
|
||||||
cls.def_static(name, func);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void bind_class(torch::Library& lib) {
|
|
||||||
auto cls = lib.class_<ScalarTypeTorch>("ScalarType")
|
|
||||||
.def(torch::init<int64_t, int64_t, int64_t, bool>());
|
|
||||||
|
|
||||||
// Bind Properties
|
|
||||||
bind_readonly_property(cls, "mantissa", &Base::mantissa);
|
|
||||||
bind_readonly_property(cls, "exponent", &Base::exponent);
|
|
||||||
bind_readonly_property(cls, "bias", &Base::bias);
|
|
||||||
bind_readonly_property(cls, "signed", &Base::is_signed);
|
|
||||||
bind_readonly_property(cls, "size_bits", &Base::size_bits);
|
|
||||||
|
|
||||||
// Bind member functions
|
|
||||||
bind_function(cls, "is_signed", &Base::is_signed);
|
|
||||||
bind_function(cls, "is_integer", &Base::is_integer);
|
|
||||||
bind_function(cls, "is_floating_point", &Base::is_floating_point);
|
|
||||||
bind_function(cls, "is_ieee_754", &Base::is_ieee_754);
|
|
||||||
bind_function(cls, "has_nans", &Base::has_nans);
|
|
||||||
bind_function(cls, "has_infs", &Base::has_infs);
|
|
||||||
bind_function(cls, "has_bias", &Base::has_bias);
|
|
||||||
|
|
||||||
bind_function(cls, "max", [](SelfPtr const& self) {
|
|
||||||
return std::visit([](auto arg) { return c10::IValue(arg); },
|
|
||||||
self.get()->max());
|
|
||||||
});
|
|
||||||
bind_function(cls, "min", [](SelfPtr const& self) {
|
|
||||||
return std::visit([](auto arg) { return c10::IValue(arg); },
|
|
||||||
self.get()->min());
|
|
||||||
});
|
|
||||||
|
|
||||||
bind_function(cls, "__len__", &ScalarTypeTorch::len);
|
|
||||||
bind_function(cls, "__str__", &Base::str);
|
|
||||||
bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) {
|
|
||||||
return *self == *other;
|
|
||||||
});
|
|
||||||
bind_function(cls, "__repr__", [](SelfPtr const& self) {
|
|
||||||
return "ScalarType." + self.get()->str();
|
|
||||||
});
|
|
||||||
|
|
||||||
bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten);
|
|
||||||
bind_static_function(cls, "__obj_unflatten__",
|
|
||||||
&ScalarTypeTorch::obj_unflatten);
|
|
||||||
|
|
||||||
// Bind static functions (convenience constructors)
|
|
||||||
bind_static_function(cls, "int_", &ScalarTypeTorch::int_);
|
|
||||||
bind_static_function(cls, "uint", &ScalarTypeTorch::uint);
|
|
||||||
bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754);
|
|
||||||
bind_static_function(cls, "float_", &ScalarTypeTorch::float_);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
using ScalarTypeId = int64_t;
|
|
||||||
using ScalarTypeTorchPtr = c10::intrusive_ptr<ScalarTypeTorch>;
|
|
||||||
|
|
||||||
// "rust style" names generally following:
|
// "rust style" names generally following:
|
||||||
// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
|
// https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
#include <torch/library.h>
|
|
||||||
|
|
||||||
#include "scalar_type.hpp"
|
|
||||||
#include "registration.h"
|
|
||||||
|
|
||||||
// Note the CORE extension will be built for (almost) all hardware targets so
|
|
||||||
// new additions must account for this. (currently not built for TPU and Neuron)
|
|
||||||
|
|
||||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
|
|
||||||
// ScalarType, a custom class for representing data types that supports
|
|
||||||
// quantized types, declared here so it can be used when creating interfaces
|
|
||||||
// for custom ops.
|
|
||||||
vllm::ScalarTypeTorch::bind_class(lib);
|
|
||||||
}
|
|
||||||
|
|
||||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
|
||||||
@@ -22,6 +22,16 @@ struct KernelVecType<float> {
|
|||||||
using v_load_vec_type = vec_op::FP32Vec16;
|
using v_load_vec_type = vec_op::FP32Vec16;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::Half> {
|
||||||
|
using q_load_vec_type = vec_op::FP16Vec8;
|
||||||
|
using q_vec_type = vec_op::FP32Vec16;
|
||||||
|
using k_load_vec_type = vec_op::FP16Vec16;
|
||||||
|
using k_vec_type = vec_op::FP32Vec16;
|
||||||
|
using qk_acc_vec_type = vec_op::FP32Vec16;
|
||||||
|
using v_load_vec_type = vec_op::FP16Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
#ifdef __AVX512BF16__
|
#ifdef __AVX512BF16__
|
||||||
template <>
|
template <>
|
||||||
struct KernelVecType<c10::BFloat16> {
|
struct KernelVecType<c10::BFloat16> {
|
||||||
@@ -375,6 +385,9 @@ void paged_attention_v1_impl_launcher(
|
|||||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
switch (head_size) {
|
switch (head_size) {
|
||||||
|
case 32:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
case 64:
|
case 64:
|
||||||
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||||
break;
|
break;
|
||||||
@@ -692,6 +705,9 @@ void paged_attention_v2_impl_launcher(
|
|||||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
switch (head_size) {
|
switch (head_size) {
|
||||||
|
case 32:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
case 64:
|
case 64:
|
||||||
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||||
break;
|
break;
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
|
|||||||
|
|
||||||
namespace vec_op {
|
namespace vec_op {
|
||||||
|
|
||||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
|
||||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||||
|
|
||||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
@@ -50,37 +50,37 @@ template <typename T> struct Vec {
|
|||||||
struct FP32Vec8;
|
struct FP32Vec8;
|
||||||
struct FP32Vec16;
|
struct FP32Vec16;
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
constexpr static int VEC_ELEM_NUM = 8;
|
||||||
|
|
||||||
__m128h reg;
|
__m128i reg;
|
||||||
|
|
||||||
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
explicit FP16Vec8(const void *ptr)
|
||||||
|
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
||||||
|
|
||||||
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
explicit FP16Vec8(const FP32Vec8 &);
|
||||||
|
|
||||||
explicit FP16Vec8(__m128h data) : reg(data) {}
|
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
||||||
|
};
|
||||||
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||||
}
|
constexpr static int VEC_ELEM_NUM = 16;
|
||||||
|
|
||||||
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
__m256i reg;
|
||||||
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
|
||||||
}
|
explicit FP16Vec16(const void *ptr)
|
||||||
|
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
||||||
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
explicit FP16Vec16(const FP32Vec16 &);
|
||||||
}
|
|
||||||
|
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||||
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
|
||||||
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
void save(void* ptr, const int elem_num) const {
|
||||||
}
|
constexpr uint32_t M = 0xFFFFFFFF;
|
||||||
|
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
|
||||||
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
_mm256_mask_storeu_epi16(ptr, mask, reg);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||||
constexpr static int VEC_ELEM_NUM = 8;
|
constexpr static int VEC_ELEM_NUM = 8;
|
||||||
@@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
|
|||||||
|
|
||||||
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
|
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {}
|
||||||
explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
explicit FP32Vec8(const BF16Vec8 &v)
|
explicit FP32Vec8(const BF16Vec8 &v)
|
||||||
: reg(_mm256_castsi256_ps(
|
: reg(_mm256_castsi256_ps(
|
||||||
@@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
: reg(_mm512_castsi512_ps(
|
: reg(_mm512_castsi512_ps(
|
||||||
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
|
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
|
||||||
|
|
||||||
|
explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {}
|
||||||
|
|
||||||
|
explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
||||||
|
|
||||||
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
||||||
|
|
||||||
explicit FP32Vec16(const INT32Vec16 &v)
|
explicit FP32Vec16(const INT32Vec16 &v)
|
||||||
@@ -430,6 +432,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
|||||||
explicit FP32Vec16(const FP32Vec8 &data)
|
explicit FP32Vec16(const FP32Vec8 &data)
|
||||||
: reg_low(data.reg), reg_high(data.reg) {}
|
: reg_low(data.reg), reg_high(data.reg) {}
|
||||||
|
|
||||||
|
explicit FP32Vec16(const FP16Vec16 &v) {
|
||||||
|
__m128i low = _mm256_extractf128_si256(v.reg, 0);
|
||||||
|
__m128i high = _mm256_extractf128_si256(v.reg, 1);
|
||||||
|
|
||||||
|
reg_low = _mm256_cvtph_ps(low);
|
||||||
|
reg_high = _mm256_cvtph_ps(high);
|
||||||
|
}
|
||||||
|
|
||||||
|
explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
||||||
|
|
||||||
explicit FP32Vec16(const BF16Vec16 &v) {
|
explicit FP32Vec16(const BF16Vec16 &v) {
|
||||||
__m128i low = _mm256_extractf128_si256(v.reg, 0);
|
__m128i low = _mm256_extractf128_si256(v.reg, 0);
|
||||||
__m128i high = _mm256_extractf128_si256(v.reg, 1);
|
__m128i high = _mm256_extractf128_si256(v.reg, 1);
|
||||||
@@ -534,24 +546,34 @@ template <typename T> using vec_t = typename VecType<T>::vec_type;
|
|||||||
|
|
||||||
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
|
||||||
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||||
|
|
||||||
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
||||||
|
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
|
||||||
*reinterpret_cast<_Float16 *>(ptr) = v;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
||||||
acc = acc + a * b;
|
acc = acc + a * b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
||||||
|
*reinterpret_cast<unsigned short *>(ptr) =
|
||||||
|
_cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline FP16Vec8::FP16Vec8(const FP32Vec8 &v)
|
||||||
|
: reg(_mm256_cvtps_ph(v.reg,
|
||||||
|
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
|
||||||
|
|
||||||
|
#ifdef __AVX512F__
|
||||||
|
inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
|
||||||
|
: reg(_mm512_cvtps_ph(v.reg,
|
||||||
|
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
|
||||||
|
#else
|
||||||
|
// FP32 -> FP16 narrowing for builds without AVX-512F, where FP32Vec16 holds
// its sixteen lanes as two 256-bit halves (reg_low / reg_high).
// Each half is converted to eight FP16 lanes via FP16Vec8(FP32Vec8); the low
// result fills bits 0..127 of `reg` and the high result is inserted into bits
// 128..255.
// Fix: the upper half previously re-used v.reg_low, so lanes 8..15 of the
// result were a copy of lanes 0..7 instead of the converted reg_high values.
inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
    : reg(_mm256_insertf128_si256(
          _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg),
          FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __AVX512BF16__
|
#ifdef __AVX512BF16__
|
||||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
||||||
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
|
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
#define DNNL_HELPER_HPP
|
#define DNNL_HELPER_HPP
|
||||||
|
|
||||||
#include <c10/util/BFloat16.h>
|
#include <c10/util/BFloat16.h>
|
||||||
|
#include <c10/util/Half.h>
|
||||||
|
|
||||||
#include "oneapi/dnnl/dnnl.hpp"
|
#include "oneapi/dnnl/dnnl.hpp"
|
||||||
|
|
||||||
@@ -32,6 +33,11 @@ struct DNNLType<c10::BFloat16> {
|
|||||||
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct DNNLType<c10::Half> {
|
||||||
|
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
|
||||||
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
constexpr inline dnnl::memory::data_type get_dnnl_type() {
|
constexpr inline dnnl::memory::data_type get_dnnl_type() {
|
||||||
return DNNLType<std::decay_t<T>>::type;
|
return DNNLType<std::decay_t<T>>::type;
|
||||||
|
|||||||
@@ -23,6 +23,13 @@ struct KernelVecType<c10::BFloat16> {
|
|||||||
using cvt_vec_type = vec_op::FP32Vec16;
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::Half> {
|
||||||
|
using load_vec_type = vec_op::FP16Vec16;
|
||||||
|
using azp_adj_load_vec_type = vec_op::INT32Vec16;
|
||||||
|
using cvt_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
template <bool AZP, typename scalar_t>
|
template <bool AZP, typename scalar_t>
|
||||||
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||||
|
|||||||
@@ -5,32 +5,29 @@
|
|||||||
|
|
||||||
#include "custom_all_reduce.cuh"
|
#include "custom_all_reduce.cuh"
|
||||||
|
|
||||||
// fake pointer type, must match fptr_t type in ops.h
|
// Fake pointer type, must match fptr_t type in ops.h.
|
||||||
|
// We use this type alias to indicate when pointers are passed in as int64_t.
|
||||||
using fptr_t = int64_t;
|
using fptr_t = int64_t;
|
||||||
static_assert(sizeof(void*) == sizeof(fptr_t));
|
static_assert(sizeof(void*) == sizeof(fptr_t));
|
||||||
|
|
||||||
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
|
||||||
const std::vector<std::string>& handles,
|
torch::Tensor& rank_data, int64_t rank,
|
||||||
const std::vector<int64_t>& offsets, int64_t rank,
|
|
||||||
bool full_nvlink) {
|
bool full_nvlink) {
|
||||||
int world_size = offsets.size();
|
int world_size = fake_ipc_ptrs.size();
|
||||||
if (world_size > 8)
|
if (world_size > 8)
|
||||||
throw std::invalid_argument("world size > 8 is not supported");
|
throw std::invalid_argument("world size > 8 is not supported");
|
||||||
if (world_size % 2 != 0)
|
if (world_size % 2 != 0)
|
||||||
throw std::invalid_argument("Odd num gpus is not supported for now");
|
throw std::invalid_argument("Odd num gpus is not supported for now");
|
||||||
if (world_size != handles.size())
|
|
||||||
throw std::invalid_argument(
|
|
||||||
"handles length should equal to offsets length");
|
|
||||||
if (rank < 0 || rank >= world_size)
|
if (rank < 0 || rank >= world_size)
|
||||||
throw std::invalid_argument("invalid rank passed in");
|
throw std::invalid_argument("invalid rank passed in");
|
||||||
|
|
||||||
cudaIpcMemHandle_t ipc_handles[8];
|
vllm::Signal* ipc_ptrs[8];
|
||||||
for (int i = 0; i < world_size; i++) {
|
for (int i = 0; i < world_size; i++) {
|
||||||
std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
|
ipc_ptrs[i] = reinterpret_cast<vllm::Signal*>(fake_ipc_ptrs[i]);
|
||||||
}
|
}
|
||||||
return (fptr_t) new vllm::CustomAllreduce(
|
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
|
||||||
reinterpret_cast<vllm::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
|
rank_data.numel(), rank, world_size,
|
||||||
rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
|
full_nvlink);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -55,26 +52,48 @@ bool _is_weak_contiguous(torch::Tensor& t) {
|
|||||||
t.numel() * t.element_size());
|
t.numel() * t.element_size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
|
/**
|
||||||
cudaStream_t stream) {
|
* Performs an out-of-place allreduce and stores result in out.
|
||||||
|
*
|
||||||
|
* If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered.
|
||||||
|
* Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
|
||||||
|
* copied into _reg_buffer.
|
||||||
|
*/
|
||||||
|
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
|
||||||
|
fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
|
||||||
|
auto stream = c10::cuda::getCurrentCUDAStream().stream();
|
||||||
|
|
||||||
|
TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
|
||||||
|
TORCH_CHECK_EQ(inp.numel(), out.numel());
|
||||||
TORCH_CHECK(_is_weak_contiguous(out));
|
TORCH_CHECK(_is_weak_contiguous(out));
|
||||||
|
TORCH_CHECK(_is_weak_contiguous(inp));
|
||||||
|
auto input_size = inp.numel() * inp.element_size();
|
||||||
|
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
|
||||||
|
if (reg_buffer) {
|
||||||
|
TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
|
||||||
|
AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size,
|
||||||
|
cudaMemcpyDeviceToDevice, stream));
|
||||||
|
} else {
|
||||||
|
reg_buffer = inp.data_ptr();
|
||||||
|
}
|
||||||
switch (out.scalar_type()) {
|
switch (out.scalar_type()) {
|
||||||
case at::ScalarType::Float: {
|
case at::ScalarType::Float: {
|
||||||
fa->allreduce<float>(stream, reinterpret_cast<float*>(inp.data_ptr()),
|
fa->allreduce<float>(stream, reinterpret_cast<float*>(reg_buffer),
|
||||||
reinterpret_cast<float*>(out.data_ptr()),
|
reinterpret_cast<float*>(out.data_ptr()),
|
||||||
out.numel());
|
out.numel());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case at::ScalarType::Half: {
|
case at::ScalarType::Half: {
|
||||||
fa->allreduce<half>(stream, reinterpret_cast<half*>(inp.data_ptr()),
|
fa->allreduce<half>(stream, reinterpret_cast<half*>(reg_buffer),
|
||||||
reinterpret_cast<half*>(out.data_ptr()), out.numel());
|
reinterpret_cast<half*>(out.data_ptr()), out.numel());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
|
#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
|
||||||
case at::ScalarType::BFloat16: {
|
case at::ScalarType::BFloat16: {
|
||||||
fa->allreduce<nv_bfloat16>(
|
fa->allreduce<nv_bfloat16>(
|
||||||
stream, reinterpret_cast<nv_bfloat16*>(inp.data_ptr()),
|
stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
|
||||||
reinterpret_cast<nv_bfloat16*>(out.data_ptr()), out.numel());
|
reinterpret_cast<nv_bfloat16*>(out.data_ptr()), out.numel());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -85,57 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) {
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
|
|
||||||
auto stream = c10::cuda::getCurrentCUDAStream().stream();
|
|
||||||
TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
|
|
||||||
TORCH_CHECK_EQ(inp.numel(), out.numel());
|
|
||||||
_all_reduce(_fa, inp, out, stream);
|
|
||||||
}
|
|
||||||
|
|
||||||
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
|
|
||||||
torch::Tensor& out) {
|
|
||||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
|
|
||||||
auto stream = c10::cuda::getCurrentCUDAStream().stream();
|
|
||||||
|
|
||||||
auto input_size = inp.numel() * inp.element_size();
|
|
||||||
TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
|
|
||||||
TORCH_CHECK_EQ(inp.numel(), out.numel());
|
|
||||||
TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(),
|
|
||||||
"registered buffer is too small to contain the input");
|
|
||||||
AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(),
|
|
||||||
input_size, cudaMemcpyDeviceToDevice, stream));
|
|
||||||
_all_reduce(_fa, reg_buffer, out, stream);
|
|
||||||
}
|
|
||||||
|
|
||||||
void dispose(fptr_t _fa) {
|
void dispose(fptr_t _fa) {
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
delete reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
delete fa;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t meta_size() { return sizeof(vllm::Signal); }
|
int64_t meta_size() { return sizeof(vllm::Signal); }
|
||||||
|
|
||||||
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
|
||||||
const std::vector<std::string>& handles,
|
|
||||||
const std::vector<int64_t>& offsets) {
|
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
fa->register_buffer(handles, offsets, t.data_ptr());
|
TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);
|
||||||
|
void* ipc_ptrs[8];
|
||||||
|
for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
|
||||||
|
ipc_ptrs[i] = reinterpret_cast<void*>(fake_ipc_ptrs[i]);
|
||||||
|
}
|
||||||
|
fa->register_buffer(ipc_ptrs);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
// Use vector<int64_t> to represent byte data for python binding compatibility.
|
||||||
fptr_t _fa) {
|
std::tuple<std::vector<int64_t>, std::vector<int64_t>>
|
||||||
|
get_graph_buffer_ipc_meta(fptr_t _fa) {
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
|
auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
|
||||||
auto options =
|
std::vector<int64_t> bytes(handle.begin(), handle.end());
|
||||||
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
|
return std::make_tuple(bytes, offsets);
|
||||||
auto handles =
|
|
||||||
torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
|
|
||||||
std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
|
|
||||||
return {handles, std::move(offsets)};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
// Use vector<int64_t> to represent byte data for python binding compatibility.
|
||||||
|
void register_graph_buffers(fptr_t _fa,
|
||||||
|
const std::vector<std::vector<int64_t>>& handles,
|
||||||
const std::vector<std::vector<int64_t>>& offsets) {
|
const std::vector<std::vector<int64_t>>& offsets) {
|
||||||
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
|
||||||
fa->register_graph_buffers(handles, offsets);
|
std::vector<std::string> bytes;
|
||||||
|
bytes.reserve(handles.size());
|
||||||
|
for (int i = 0; i < handles.size(); i++) {
|
||||||
|
bytes.emplace_back(handles[i].begin(), handles[i].end());
|
||||||
|
}
|
||||||
|
bytes.reserve(handles.size());
|
||||||
|
fa->register_graph_buffers(bytes, offsets);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -285,46 +285,52 @@ class CustomAllreduce {
|
|||||||
int world_size_;
|
int world_size_;
|
||||||
bool full_nvlink_;
|
bool full_nvlink_;
|
||||||
|
|
||||||
// below are device pointers
|
|
||||||
RankSignals sg_;
|
RankSignals sg_;
|
||||||
|
// Stores a map from a pointer to its peer pointers from all ranks.
|
||||||
std::unordered_map<void*, RankData*> buffers_;
|
std::unordered_map<void*, RankData*> buffers_;
|
||||||
Signal* self_sg_;
|
Signal* self_sg_;
|
||||||
|
|
||||||
// stores the registered device pointers from all ranks
|
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
|
||||||
|
// For cuda graph to work, all kernel arguments must be fixed during graph
|
||||||
|
// capture time. However, the peer pointers are not known during graph capture
|
||||||
|
// time. Therefore, during capture, we increment the rank data pointer and use
|
||||||
|
// that as the argument to the kernel. The kernel arguments are stored in
|
||||||
|
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
|
||||||
|
// memory pointed to by the pointers in graph_unreg_buffers_ when
|
||||||
|
// the IPC handles are exchanged between ranks.
|
||||||
|
//
|
||||||
|
// The overall process looks like this:
|
||||||
|
// 1. Graph capture.
|
||||||
|
// 2. Each rank obtains the IPC handles for each address used during cuda
|
||||||
|
// graph capture using get_graph_buffer_ipc_meta.
|
||||||
|
// 3. (In Python) all gather the IPC handles.
|
||||||
|
// 4. Obtain the peer pointers by opening the IPC handles, and store them in
|
||||||
|
// the rank data array at corresponding positions.
|
||||||
RankData *d_rank_data_base_, *d_rank_data_end_;
|
RankData *d_rank_data_base_, *d_rank_data_end_;
|
||||||
std::vector<void*> graph_unreg_buffers_;
|
std::vector<void*> graph_unreg_buffers_;
|
||||||
// a map from IPC handles to opened IPC pointers
|
// a map from IPC handles to opened IPC pointers
|
||||||
std::map<IPC_KEY, char*> ipc_handles_;
|
std::map<IPC_KEY, char*> ipc_handles_;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* meta is a pointer to device metadata and temporary buffer for allreduce.
|
* Signals are an array of ipc-enabled buffers from all ranks.
|
||||||
|
* For each of the buffers, the layout is as follows:
|
||||||
|
* | -- sizeof(Signal) -- | ------ a few MB ----- |
|
||||||
|
* The first section is for allreduce synchronization, and the second section
|
||||||
|
* is for storing the intermediate results required by some allreduce algos.
|
||||||
*
|
*
|
||||||
* There's a total of sizeof(Signal) of prefix before the actual data,
|
* Note: this class does not own any device memory. Any required buffers
|
||||||
* so meta + 1 points to actual temporary buffer.
|
* are passed in from the constructor.
|
||||||
*
|
|
||||||
* note: this class does not own any device memory. Any required buffers
|
|
||||||
* are passed in from the constructor
|
|
||||||
*/
|
*/
|
||||||
CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz,
|
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
|
||||||
const cudaIpcMemHandle_t* handles,
|
int rank, int world_size, bool full_nvlink = true)
|
||||||
const std::vector<int64_t>& offsets, int rank,
|
|
||||||
bool full_nvlink = true)
|
|
||||||
: rank_(rank),
|
: rank_(rank),
|
||||||
world_size_(offsets.size()),
|
world_size_(world_size),
|
||||||
full_nvlink_(full_nvlink),
|
full_nvlink_(full_nvlink),
|
||||||
self_sg_(meta),
|
self_sg_(signals[rank]),
|
||||||
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
|
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
|
||||||
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
|
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
|
||||||
for (int i = 0; i < world_size_; i++) {
|
for (int i = 0; i < world_size_; i++) {
|
||||||
Signal* rank_sg;
|
sg_.signals[i] = signals[i];
|
||||||
if (i != rank_) {
|
|
||||||
char* handle = open_ipc_handle(&handles[i]);
|
|
||||||
handle += offsets[i];
|
|
||||||
rank_sg = (Signal*)handle;
|
|
||||||
} else {
|
|
||||||
rank_sg = self_sg_;
|
|
||||||
}
|
|
||||||
sg_.signals[i] = rank_sg;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -341,11 +347,10 @@ class CustomAllreduce {
|
|||||||
return it->second;
|
return it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<std::vector<uint8_t>, std::vector<int64_t>>
|
std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
|
||||||
get_graph_buffer_ipc_meta() {
|
|
||||||
auto num_buffers = graph_unreg_buffers_.size();
|
auto num_buffers = graph_unreg_buffers_.size();
|
||||||
auto handle_sz = sizeof(cudaIpcMemHandle_t);
|
auto handle_sz = sizeof(cudaIpcMemHandle_t);
|
||||||
std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
|
std::string handles(handle_sz * num_buffers, static_cast<char>(0));
|
||||||
std::vector<int64_t> offsets(num_buffers);
|
std::vector<int64_t> offsets(num_buffers);
|
||||||
for (int i = 0; i < num_buffers; i++) {
|
for (int i = 0; i < num_buffers; i++) {
|
||||||
auto ptr = graph_unreg_buffers_[i];
|
auto ptr = graph_unreg_buffers_[i];
|
||||||
@@ -370,26 +375,22 @@ class CustomAllreduce {
|
|||||||
std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
|
std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
|
||||||
}
|
}
|
||||||
|
|
||||||
void register_buffer(const std::vector<std::string>& handles,
|
/**
|
||||||
const std::vector<int64_t>& offsets, void* self) {
|
* Register already-shared IPC pointers.
|
||||||
|
*/
|
||||||
|
void register_buffer(void** ptrs) {
|
||||||
check_rank_data_capacity();
|
check_rank_data_capacity();
|
||||||
RankData data;
|
RankData data;
|
||||||
for (int i = 0; i < world_size_; i++) {
|
for (int i = 0; i < world_size_; i++) {
|
||||||
if (i != rank_) {
|
data.ptrs[i] = ptrs[i];
|
||||||
char* handle = open_ipc_handle(handles[i].data());
|
|
||||||
handle += offsets[i];
|
|
||||||
data.ptrs[i] = handle;
|
|
||||||
} else {
|
|
||||||
data.ptrs[i] = self;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
auto d_data = d_rank_data_base_++;
|
auto d_data = d_rank_data_base_++;
|
||||||
CUDACHECK(
|
CUDACHECK(
|
||||||
cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
|
cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
|
||||||
buffers_[self] = d_data;
|
buffers_[ptrs[rank_]] = d_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
// note: when registering graph buffers, we intentionally choose to not
|
// Note: when registering graph buffers, we intentionally choose to not
|
||||||
// deduplicate the addresses. That means if the allocator reuses some
|
// deduplicate the addresses. That means if the allocator reuses some
|
||||||
// addresses, they will be registered again. This is to account for the remote
|
// addresses, they will be registered again. This is to account for the remote
|
||||||
// possibility of different allocation patterns between ranks. For example,
|
// possibility of different allocation patterns between ranks. For example,
|
||||||
@@ -424,11 +425,13 @@ class CustomAllreduce {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the result after careful grid search. Using 36 blocks give the best
|
* Performs allreduce, assuming input has already been registered.
|
||||||
* or close to the best runtime on the devices I tried: A100, A10, A30, T4,
|
*
|
||||||
* V100. You'll notice that NCCL kernels also only take a small amount of SMs.
|
* Block and grid default configs are results after careful grid search. Using
|
||||||
* Not quite sure the underlying reason, but my guess is that too many SMs
|
* 36 blocks gives the best or close to the best runtime on the devices I
|
||||||
* will cause contention on NVLink bus.
|
* tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
|
||||||
|
* take a small amount of SMs. Not quite sure the underlying reason, but my
|
||||||
|
* guess is that too many SMs will cause contention on NVLink bus.
|
||||||
*/
|
*/
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void allreduce(cudaStream_t stream, T* input, T* output, int size,
|
void allreduce(cudaStream_t stream, T* input, T* output, int size,
|
||||||
|
|||||||
@@ -135,24 +135,26 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
|
|||||||
void* rank_data;
|
void* rank_data;
|
||||||
size_t rank_data_sz = 16 * 1024 * 1024;
|
size_t rank_data_sz = 16 * 1024 * 1024;
|
||||||
CUDACHECK(cudaMalloc(&rank_data, rank_data_sz));
|
CUDACHECK(cudaMalloc(&rank_data, rank_data_sz));
|
||||||
std::vector<int64_t> offsets(nRanks, 0);
|
vllm::Signal* ipc_ptrs[8];
|
||||||
vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles,
|
for (int i = 0; i < nRanks; i++) {
|
||||||
offsets, myRank);
|
if (i == myRank)
|
||||||
|
ipc_ptrs[i] = buffer;
|
||||||
|
else
|
||||||
|
CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i],
|
||||||
|
cudaIpcMemLazyEnablePeerAccess));
|
||||||
|
}
|
||||||
|
vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks);
|
||||||
auto* self_data =
|
auto* self_data =
|
||||||
reinterpret_cast<T*>(reinterpret_cast<char*>(buffer) +
|
reinterpret_cast<T*>(reinterpret_cast<char*>(buffer) +
|
||||||
sizeof(vllm::Signal) + data_size * sizeof(T));
|
sizeof(vllm::Signal) + data_size * sizeof(T));
|
||||||
// hack buffer registration
|
// hack buffer registration
|
||||||
{
|
{
|
||||||
std::vector<std::string> handles;
|
void* data[8];
|
||||||
handles.reserve(nRanks);
|
|
||||||
for (int i = 0; i < nRanks; i++) {
|
for (int i = 0; i < nRanks; i++) {
|
||||||
char* begin = (char*)&data_handles[i];
|
data[i] =
|
||||||
char* end = (char*)&data_handles[i + 1];
|
((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T);
|
||||||
handles.emplace_back(begin, end);
|
|
||||||
}
|
}
|
||||||
std::vector<int64_t> offsets(nRanks,
|
fa.register_buffer(data);
|
||||||
sizeof(vllm::Signal) + data_size * sizeof(T));
|
|
||||||
fa.register_buffer(handles, offsets, self_data);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
double* ground_truth;
|
double* ground_truth;
|
||||||
|
|||||||
@@ -1,21 +1,13 @@
|
|||||||
#include <torch/all.h>
|
#include "type_convert.cuh"
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include "dispatch_utils.h"
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
#include <cuda_bf16.h>
|
|
||||||
#include <cuda_fp16.h>
|
|
||||||
#include <cub/util_type.cuh>
|
|
||||||
#include <cub/cub.cuh>
|
#include <cub/cub.cuh>
|
||||||
#else
|
#else
|
||||||
#include <hip/hip_bf16.h>
|
|
||||||
#include <hip/hip_fp16.h>
|
|
||||||
#include <hipcub/util_type.hpp>
|
|
||||||
#include <hipcub/hipcub.hpp>
|
#include <hipcub/hipcub.hpp>
|
||||||
|
|
||||||
using __nv_bfloat16 = __hip_bfloat16;
|
|
||||||
using __nv_bfloat162 = __hip_bfloat162;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@@ -51,155 +43,6 @@ __global__ void rms_norm_kernel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Converter structs for the conversion from torch types to HIP/CUDA types,
|
|
||||||
and the associated type conversions within HIP/CUDA. These helpers need
|
|
||||||
to be implemented for now because the relevant type conversion
|
|
||||||
operators/constructors are not consistently implemented by HIP/CUDA, so
|
|
||||||
a generic conversion via type casts cannot be implemented.
|
|
||||||
|
|
||||||
Each struct should have the member static constexpr bool `exists`:
|
|
||||||
If false, the optimized kernel is not used for the corresponding torch type.
|
|
||||||
If true, the struct should be fully defined as shown in the examples below.
|
|
||||||
*/
|
|
||||||
template <typename torch_type>
|
|
||||||
struct _typeConvert {
|
|
||||||
static constexpr bool exists = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
|
|
||||||
// CUDA < 12.0 runs into issues with packed type conversion
|
|
||||||
template <>
|
|
||||||
struct _typeConvert<c10::Half> {
|
|
||||||
static constexpr bool exists = true;
|
|
||||||
using hip_type = __half;
|
|
||||||
using packed_hip_type = __half2;
|
|
||||||
|
|
||||||
__device__ static inline float convert(hip_type x) { return __half2float(x); }
|
|
||||||
__device__ static inline float2 convert(packed_hip_type x) {
|
|
||||||
return __half22float2(x);
|
|
||||||
}
|
|
||||||
__device__ static inline hip_type convert(float x) {
|
|
||||||
return __float2half_rn(x);
|
|
||||||
}
|
|
||||||
__device__ static inline packed_hip_type convert(float2 x) {
|
|
||||||
return __float22half2_rn(x);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
|
||||||
// CUDA_ARCH < 800 does not have BF16 support
|
|
||||||
// TODO: Add in ROCm support once public headers handle bf16 maturely
|
|
||||||
template <>
|
|
||||||
struct _typeConvert<c10::BFloat16> {
|
|
||||||
static constexpr bool exists = true;
|
|
||||||
using hip_type = __nv_bfloat16;
|
|
||||||
using packed_hip_type = __nv_bfloat162;
|
|
||||||
|
|
||||||
__device__ static inline float convert(hip_type x) {
|
|
||||||
return __bfloat162float(x);
|
|
||||||
}
|
|
||||||
__device__ static inline float2 convert(packed_hip_type x) {
|
|
||||||
return __bfloat1622float2(x);
|
|
||||||
}
|
|
||||||
__device__ static inline hip_type convert(float x) {
|
|
||||||
return __float2bfloat16(x);
|
|
||||||
}
|
|
||||||
__device__ static inline packed_hip_type convert(float2 x) {
|
|
||||||
return __float22bfloat162_rn(x);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
|
||||||
#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
|
|
||||||
// 12000))
|
|
||||||
|
|
||||||
/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
|
|
||||||
for appropriate specializations of fused_add_rms_norm_kernel.
|
|
||||||
Only functions that are necessary in that kernel are implemented.
|
|
||||||
Alignment to 16 bytes is required to use 128-bit global memory ops.
|
|
||||||
*/
|
|
||||||
template <typename scalar_t, int width>
|
|
||||||
struct alignas(16) _f16Vec {
|
|
||||||
/* Not theoretically necessary that width is a power of 2 but should
|
|
||||||
almost always be the case for optimization purposes */
|
|
||||||
static_assert(width > 0 && (width & (width - 1)) == 0,
|
|
||||||
"Width is not a positive power of 2!");
|
|
||||||
using Converter = _typeConvert<scalar_t>;
|
|
||||||
using T1 = typename Converter::hip_type;
|
|
||||||
using T2 = typename Converter::packed_hip_type;
|
|
||||||
T1 data[width];
|
|
||||||
|
|
||||||
__device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
|
|
||||||
if constexpr (width % 2 == 0) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; i += 2) {
|
|
||||||
T2 temp{data[i], data[i + 1]};
|
|
||||||
temp += T2{other.data[i], other.data[i + 1]};
|
|
||||||
data[i] = temp.x;
|
|
||||||
data[i + 1] = temp.y;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; ++i) data[i] += other.data[i];
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
|
|
||||||
if constexpr (width % 2 == 0) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; i += 2) {
|
|
||||||
T2 temp{data[i], data[i + 1]};
|
|
||||||
temp *= T2{other.data[i], other.data[i + 1]};
|
|
||||||
data[i] = temp.x;
|
|
||||||
data[i + 1] = temp.y;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; ++i) data[i] *= other.data[i];
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ _f16Vec& operator*=(const float scale) {
|
|
||||||
if constexpr (width % 2 == 0) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; i += 2) {
|
|
||||||
float2 temp_f = Converter::convert(T2{data[i], data[i + 1]});
|
|
||||||
temp_f.x *= scale;
|
|
||||||
temp_f.y *= scale;
|
|
||||||
T2 temp = Converter::convert(temp_f);
|
|
||||||
data[i] = temp.x;
|
|
||||||
data[i + 1] = temp.y;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; ++i) {
|
|
||||||
float temp = Converter::convert(data[i]) * scale;
|
|
||||||
data[i] = Converter::convert(temp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ float sum_squares() const {
|
|
||||||
float result = 0.0f;
|
|
||||||
if constexpr (width % 2 == 0) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; i += 2) {
|
|
||||||
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
|
||||||
result += z.x * z.x + z.y * z.y;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < width; ++i) {
|
|
||||||
float x = Converter::convert(data[i]);
|
|
||||||
result += x * x;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Function specialization in the case of FP16/BF16 tensors.
|
/* Function specialization in the case of FP16/BF16 tensors.
|
||||||
Additional optimizations we can make in this case are
|
Additional optimizations we can make in this case are
|
||||||
packed and vectorized operations, which help with the
|
packed and vectorized operations, which help with the
|
||||||
|
|||||||
234
csrc/layernorm_quant_kernels.cu
Normal file
234
csrc/layernorm_quant_kernels.cu
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
/*
|
||||||
|
* This file contains the CUDA kernels for the fused quantized layernorm.
|
||||||
|
* The kernels correspond to the kernels in layernorm_kernels.cu, except they
|
||||||
|
* also produce quantized output directly.
|
||||||
|
* Currently, only static fp8 quantization is supported.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "type_convert.cuh"
|
||||||
|
#include "quantization/fp8/common.cuh"
|
||||||
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
|
#include <torch/cuda.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include <cub/cub.cuh>
|
||||||
|
#else
|
||||||
|
#include <hipcub/hipcub.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
|
template <typename scalar_t>
|
||||||
|
__global__ void rms_norm_static_fp8_quant_kernel(
|
||||||
|
FP8_TYPE* __restrict__ out, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
|
const float* __restrict__ scale, // [1]
|
||||||
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
|
__shared__ float s_variance;
|
||||||
|
float variance = 0.0f;
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
|
const float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||||
|
variance += x * x;
|
||||||
|
}
|
||||||
|
|
||||||
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
|
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// invert scale to avoid division
|
||||||
|
float const scale_inv = 1.0f / *scale;
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
|
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||||
|
float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
|
out[blockIdx.x * hidden_size + idx] =
|
||||||
|
scaled_fp8_conversion<true>(out_norm, scale_inv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Function specialization in the case of FP16/BF16 tensors.
|
||||||
|
Additional optimizations we can make in this case are
|
||||||
|
packed and vectorized operations, which help with the
|
||||||
|
memory latency bottleneck. */
|
||||||
|
template <typename scalar_t, int width>
|
||||||
|
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||||
|
fused_add_rms_norm_static_fp8_quant_kernel(
|
||||||
|
FP8_TYPE* __restrict__ out, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
|
const float* __restrict__ scale, // [1]
|
||||||
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
|
// Sanity checks on our vector struct and type-punned pointer arithmetic
|
||||||
|
static_assert(std::is_pod_v<_f16Vec<scalar_t, width>>);
|
||||||
|
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||||
|
|
||||||
|
const int vec_hidden_size = hidden_size / width;
|
||||||
|
__shared__ float s_variance;
|
||||||
|
float variance = 0.0f;
|
||||||
|
/* These and the argument pointers are all declared `restrict` as they are
|
||||||
|
not aliased in practice. Argument pointers should not be dereferenced
|
||||||
|
in this kernel as that would be undefined behavior */
|
||||||
|
auto* __restrict__ input_v =
|
||||||
|
reinterpret_cast<_f16Vec<scalar_t, width>*>(input);
|
||||||
|
auto* __restrict__ residual_v =
|
||||||
|
reinterpret_cast<_f16Vec<scalar_t, width>*>(residual);
|
||||||
|
auto* __restrict__ weight_v =
|
||||||
|
reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
|
_f16Vec<scalar_t, width> temp = input_v[id];
|
||||||
|
temp += residual_v[id];
|
||||||
|
variance += temp.sum_squares();
|
||||||
|
residual_v[id] = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
|
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// invert scale to avoid division
|
||||||
|
float const scale_inv = 1.0f / *scale;
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
|
_f16Vec<scalar_t, width> temp = residual_v[id];
|
||||||
|
temp *= s_variance;
|
||||||
|
temp *= weight_v[idx];
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; ++i) {
|
||||||
|
out[id * width + i] =
|
||||||
|
scaled_fp8_conversion<true>(float(temp.data[i]), scale_inv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Generic fused_add_rms_norm_kernel
|
||||||
|
The width field is not used here but necessary for other specializations.
|
||||||
|
*/
|
||||||
|
template <typename scalar_t, int width>
|
||||||
|
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||||
|
fused_add_rms_norm_static_fp8_quant_kernel(
|
||||||
|
FP8_TYPE* __restrict__ out, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
|
const float* __restrict__ scale, // [1]
|
||||||
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
|
__shared__ float s_variance;
|
||||||
|
float variance = 0.0f;
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
|
scalar_t z = input[blockIdx.x * hidden_size + idx];
|
||||||
|
z += residual[blockIdx.x * hidden_size + idx];
|
||||||
|
float x = (float)z;
|
||||||
|
variance += x * x;
|
||||||
|
residual[blockIdx.x * hidden_size + idx] = z;
|
||||||
|
}
|
||||||
|
|
||||||
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
|
variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// invert scale to avoid division
|
||||||
|
float const scale_inv = 1.0f / *scale;
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
|
float x = (float)residual[blockIdx.x * hidden_size + idx];
|
||||||
|
float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
|
out[blockIdx.x * hidden_size + idx] =
|
||||||
|
scaled_fp8_conversion<true>(out_norm, scale_inv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size]
|
||||||
|
torch::Tensor& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& weight, // [hidden_size]
|
||||||
|
torch::Tensor& scale, // [1]
|
||||||
|
double epsilon) {
|
||||||
|
int hidden_size = input.size(-1);
|
||||||
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
|
dim3 grid(num_tokens);
|
||||||
|
dim3 block(std::min(hidden_size, 1024));
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||||
|
vllm::rms_norm_static_fp8_quant_kernel<scalar_t>
|
||||||
|
<<<grid, block, 0, stream>>>(
|
||||||
|
out.data_ptr<FP8_TYPE>(), input.data_ptr<scalar_t>(),
|
||||||
|
weight.data_ptr<scalar_t>(), scale.data_ptr<float>(), epsilon,
|
||||||
|
num_tokens, hidden_size);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||||
|
input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \
|
||||||
|
vllm::fused_add_rms_norm_static_fp8_quant_kernel<scalar_t, width> \
|
||||||
|
<<<grid, block, 0, stream>>>( \
|
||||||
|
out.data_ptr<FP8_TYPE>(), input.data_ptr<scalar_t>(), \
|
||||||
|
residual.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), \
|
||||||
|
scale.data_ptr<float>(), epsilon, num_tokens, hidden_size); \
|
||||||
|
});
|
||||||
|
|
||||||
|
void fused_add_rms_norm_static_fp8_quant(
|
||||||
|
torch::Tensor& out, // [..., hidden_size],
|
||||||
|
torch::Tensor& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& residual, // [..., hidden_size]
|
||||||
|
torch::Tensor& weight, // [hidden_size]
|
||||||
|
torch::Tensor& scale, // [1]
|
||||||
|
double epsilon) {
|
||||||
|
int hidden_size = input.size(-1);
|
||||||
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
|
dim3 grid(num_tokens);
|
||||||
|
/* This kernel is memory-latency bound in many scenarios.
|
||||||
|
When num_tokens is large, a smaller block size allows
|
||||||
|
for increased block occupancy on CUs and better latency
|
||||||
|
hiding on global mem ops. */
|
||||||
|
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
|
||||||
|
dim3 block(std::min(hidden_size, max_block_size));
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
/*If the tensor types are FP16/BF16, try to use the optimized kernel
|
||||||
|
with packed + vectorized ops.
|
||||||
|
Max optimization is achieved with a width-8 vector of FP16/BF16s
|
||||||
|
since we can load at most 128 bits at once in a global memory op.
|
||||||
|
However, this requires each tensor's data to be aligned to 16
|
||||||
|
bytes.
|
||||||
|
*/
|
||||||
|
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||||
|
auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
|
||||||
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
|
bool ptrs_are_aligned =
|
||||||
|
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||||
|
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
||||||
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
|
} else {
|
||||||
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
|
|||||||
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
|
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
|
||||||
}
|
}
|
||||||
out += kChunkSize;
|
out += kChunkSize;
|
||||||
|
|
||||||
|
int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
|
||||||
|
// in case the final state is separated between the last "smem_exchange" and
|
||||||
|
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
|
||||||
|
// (which occurs when `final_state_position` is a non-positivie index)
|
||||||
|
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
|
||||||
|
if (final_state_position < 0 && seqlen > kWidth){
|
||||||
|
input_t vals_load[kNElts] = {0};
|
||||||
|
if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
|
||||||
|
// chunk = n_chunks - 2, a segment of the final state sits in the last index
|
||||||
|
reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
|
||||||
|
#pragma unroll
|
||||||
|
for (int w = 0; w < -final_state_position; ++w){
|
||||||
|
conv_states[w] = vals_load[kNElts + final_state_position + w];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((chunk == n_chunks - 1) && tidx == 0){
|
||||||
|
// chunk = n_chunks - 1, the second segment of the final state first positions
|
||||||
|
reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
|
||||||
|
for (int w = -final_state_position; w < kWidth - 1; ++w){
|
||||||
|
conv_states[w] = vals_load[w + final_state_position];
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Final state is stored in the smem_exchange last token slot,
|
// Final state is stored in the smem_exchange last token slot,
|
||||||
// in case seqlen < kWidth, we would need to take the final state from the
|
// in case seqlen < kWidth, we would need to take the final state from the
|
||||||
@@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// in case the final state is in between the threads data
|
// in case the final state is in between the threads data
|
||||||
reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
|
|
||||||
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
|
|
||||||
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
|
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
|
||||||
|
if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
|
||||||
|
// In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a
|
||||||
|
// illegal access error on H100.
|
||||||
|
// Therefore, we access last_thread + 1, only if the final state data sits there
|
||||||
|
reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
|
||||||
|
}
|
||||||
|
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w = 0; w < kWidth - 1; ++w){
|
for (int w = 0; w < kWidth - 1; ++w){
|
||||||
conv_states[w] = x_vals_load[offset + w ];
|
conv_states[w] = x_vals_load[offset + w ];
|
||||||
|
|||||||
@@ -484,21 +484,22 @@ torch::Tensor marlin_gemm_moe(
|
|||||||
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
|
const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
|
||||||
torch::Tensor& b_zeros, const torch::Tensor& g_idx,
|
torch::Tensor& b_zeros, const torch::Tensor& g_idx,
|
||||||
const torch::Tensor& perm, torch::Tensor& workspace,
|
const torch::Tensor& perm, torch::Tensor& workspace,
|
||||||
vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n,
|
vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n,
|
||||||
int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
|
int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk,
|
||||||
int64_t moe_block_size, bool replicate_input, bool apply_weights) {
|
int64_t moe_block_size, bool replicate_input, bool apply_weights) {
|
||||||
|
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
|
||||||
bool has_zp = b_zeros.size(1) != 0;
|
bool has_zp = b_zeros.size(1) != 0;
|
||||||
if (has_zp) {
|
if (has_zp) {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
*b_q_type == vllm::kU4,
|
b_q_type == vllm::kU4,
|
||||||
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str());
|
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
|
||||||
} else {
|
} else {
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128,
|
b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
|
||||||
"b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str());
|
"b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
int pack_factor = 32 / b_q_type->size_bits();
|
int pack_factor = 32 / b_q_type.size_bits();
|
||||||
|
|
||||||
int max_par = 4;
|
int max_par = 4;
|
||||||
|
|
||||||
@@ -575,7 +576,7 @@ torch::Tensor marlin_gemm_moe(
|
|||||||
topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
|
topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(),
|
||||||
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
|
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(),
|
||||||
expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
|
expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(),
|
||||||
*b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
|
b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size,
|
||||||
num_experts, topk, moe_block_size, dev,
|
num_experts, topk, moe_block_size, dev,
|
||||||
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
|
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par,
|
||||||
replicate_input, apply_weights);
|
replicate_input, apply_weights);
|
||||||
|
|||||||
@@ -1,15 +1,17 @@
|
|||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <THC/THCAtomics.cuh>
|
#include <THC/THCAtomics.cuh>
|
||||||
|
|
||||||
#include "cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "../dispatch_utils.h"
|
||||||
|
|
||||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
namespace moe {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
||||||
@@ -32,10 +34,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
|||||||
extern __shared__ int32_t shared_mem[];
|
extern __shared__ int32_t shared_mem[];
|
||||||
|
|
||||||
int32_t* tokens_cnts =
|
int32_t* tokens_cnts =
|
||||||
shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
|
shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts)
|
||||||
int32_t* cumsum =
|
int32_t* cumsum =
|
||||||
shared_mem + (num_experts + 1) *
|
shared_mem +
|
||||||
num_experts; // 1d tensor with shape (num_experts + 1)
|
(blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
|
||||||
|
|
||||||
for (int i = 0; i < num_experts; ++i) {
|
for (int i = 0; i < num_experts; ++i) {
|
||||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||||
@@ -53,10 +55,12 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// For each expert we accumulate the token counts from the different threads.
|
// For each expert we accumulate the token counts from the different threads.
|
||||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
if (threadIdx.x < num_experts) {
|
||||||
for (int i = 1; i <= blockDim.x; ++i) {
|
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
for (int i = 1; i <= blockDim.x; ++i) {
|
||||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||||
|
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
@@ -79,9 +83,11 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
|||||||
* For each expert, each thread processes the tokens of the corresponding
|
* For each expert, each thread processes the tokens of the corresponding
|
||||||
* blocks and stores the corresponding expert_id for each block.
|
* blocks and stores the corresponding expert_id for each block.
|
||||||
*/
|
*/
|
||||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
if (threadIdx.x < num_experts) {
|
||||||
i += block_size) {
|
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||||
expert_ids[i / block_size] = threadIdx.x;
|
i += block_size) {
|
||||||
|
expert_ids[i / block_size] = threadIdx.x;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -106,6 +112,24 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
|||||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t, int TOPK>
|
||||||
|
__global__ void moe_sum_kernel(
|
||||||
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
|
const scalar_t* __restrict__ input, // [..., topk, d]
|
||||||
|
const int d) {
|
||||||
|
const int64_t token_idx = blockIdx.x;
|
||||||
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
|
scalar_t x = 0.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int k = 0; k < TOPK; ++k) {
|
||||||
|
x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
|
||||||
|
}
|
||||||
|
out[token_idx * d + idx] = x;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace moe
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||||
@@ -117,18 +141,62 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||||
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
|
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
|
||||||
// tensors
|
// tensors
|
||||||
|
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
|
||||||
const int32_t shared_mem =
|
const int32_t shared_mem =
|
||||||
((num_experts + 1) * num_experts + (num_experts + 1)) *
|
((num_thread + 1) * num_experts + (num_experts + 1)) *
|
||||||
sizeof(int32_t);
|
sizeof(int32_t);
|
||||||
|
|
||||||
// set dynamic shared mem
|
// set dynamic shared mem
|
||||||
auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
|
auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
|
||||||
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
||||||
(void*)kernel, shared_mem));
|
(void*)kernel, shared_mem));
|
||||||
kernel<<<1, num_experts, shared_mem, stream>>>(
|
kernel<<<1, num_thread, shared_mem, stream>>>(
|
||||||
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
|
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
|
||||||
experts_ids.data_ptr<int32_t>(),
|
experts_ids.data_ptr<int32_t>(),
|
||||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||||
topk_ids.numel());
|
topk_ids.numel());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||||
|
torch::Tensor& output) // [num_tokens, hidden_size]
|
||||||
|
{
|
||||||
|
const int hidden_size = input.size(-1);
|
||||||
|
const int num_tokens = output.numel() / hidden_size;
|
||||||
|
const int topk = input.size(1);
|
||||||
|
|
||||||
|
dim3 grid(num_tokens);
|
||||||
|
dim3 block(std::min(hidden_size, 1024));
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
|
switch (topk) {
|
||||||
|
case 2:
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
|
||||||
|
vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
|
||||||
|
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
||||||
|
hidden_size);
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
|
||||||
|
vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
|
||||||
|
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
||||||
|
hidden_size);
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
|
||||||
|
vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
|
||||||
|
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
||||||
|
hidden_size);
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
at::sum_out(output, input, 1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -5,3 +5,10 @@
|
|||||||
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||||
torch::Tensor& token_expert_indices,
|
torch::Tensor& token_expert_indices,
|
||||||
torch::Tensor& gating_output);
|
torch::Tensor& gating_output);
|
||||||
|
|
||||||
|
void moe_sum(torch::Tensor& input, torch::Tensor& output);
|
||||||
|
|
||||||
|
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||||
|
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||||
|
torch::Tensor experts_ids,
|
||||||
|
torch::Tensor num_tokens_post_pad);
|
||||||
|
|||||||
@@ -8,13 +8,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
"token_expert_indices, Tensor gating_output) -> ()");
|
"token_expert_indices, Tensor gating_output) -> ()");
|
||||||
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
||||||
|
|
||||||
|
// Calculate the result of moe by summing up the partial results
|
||||||
|
// from all selected experts.
|
||||||
|
m.def("moe_sum(Tensor! input, Tensor output) -> ()");
|
||||||
|
m.impl("moe_sum", torch::kCUDA, &moe_sum);
|
||||||
|
|
||||||
|
// Aligning the number of tokens to be processed by each expert such
|
||||||
|
// that it is divisible by the block size.
|
||||||
|
m.def(
|
||||||
|
"moe_align_block_size(Tensor topk_ids, int num_experts,"
|
||||||
|
" int block_size, Tensor! sorted_token_ids,"
|
||||||
|
" Tensor! experts_ids,"
|
||||||
|
" Tensor! num_tokens_post_pad) -> ()");
|
||||||
|
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
m.def(
|
m.def(
|
||||||
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
|
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
|
||||||
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
|
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
|
||||||
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
|
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, "
|
||||||
"__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
|
"int b_q_type, SymInt size_m, "
|
||||||
"int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
|
"SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int "
|
||||||
|
"topk, "
|
||||||
"int moe_block_size, bool replicate_input, bool apply_weights)"
|
"int moe_block_size, bool replicate_input, bool apply_weights)"
|
||||||
" -> Tensor");
|
" -> Tensor");
|
||||||
// conditionally compiled so impl registration is in source file
|
// conditionally compiled so impl registration is in source file
|
||||||
|
|||||||
64
csrc/ops.h
64
csrc/ops.h
@@ -5,6 +5,30 @@
|
|||||||
|
|
||||||
#include "core/scalar_type.hpp"
|
#include "core/scalar_type.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
|
||||||
|
// Ensure tensor is on CUDA
|
||||||
|
if (!tensor.is_cuda()) {
|
||||||
|
throw std::runtime_error("Tensor must be on CUDA device");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the raw data pointer
|
||||||
|
void* data_ptr = tensor.data_ptr();
|
||||||
|
|
||||||
|
// Get tensor sizes and strides
|
||||||
|
std::vector<int64_t> sizes = tensor.sizes().vec();
|
||||||
|
std::vector<int64_t> strides = tensor.strides().vec();
|
||||||
|
|
||||||
|
// Get tensor options (dtype, device)
|
||||||
|
auto options = tensor.options();
|
||||||
|
|
||||||
|
// Create a new tensor from the raw data pointer
|
||||||
|
auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options);
|
||||||
|
|
||||||
|
return new_tensor;
|
||||||
|
}
|
||||||
|
|
||||||
void paged_attention_v1(
|
void paged_attention_v1(
|
||||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
@@ -32,6 +56,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
|||||||
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
|
||||||
torch::Tensor& weight, double epsilon);
|
torch::Tensor& weight, double epsilon);
|
||||||
|
|
||||||
|
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
|
||||||
|
torch::Tensor& weight, torch::Tensor& scale,
|
||||||
|
double epsilon);
|
||||||
|
|
||||||
|
void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out,
|
||||||
|
torch::Tensor& input,
|
||||||
|
torch::Tensor& residual,
|
||||||
|
torch::Tensor& weight,
|
||||||
|
torch::Tensor& scale, double epsilon);
|
||||||
|
|
||||||
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
|
||||||
torch::Tensor& key, int64_t head_size,
|
torch::Tensor& key, int64_t head_size,
|
||||||
torch::Tensor& cos_sin_cache, bool is_neox);
|
torch::Tensor& cos_sin_cache, bool is_neox);
|
||||||
@@ -48,6 +82,9 @@ void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
|
|||||||
|
|
||||||
void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);
|
void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
|
void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
|
||||||
|
double threshold);
|
||||||
|
|
||||||
void gelu_new(torch::Tensor& out, torch::Tensor& input);
|
void gelu_new(torch::Tensor& out, torch::Tensor& input);
|
||||||
|
|
||||||
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
void gelu_fast(torch::Tensor& out, torch::Tensor& input);
|
||||||
@@ -142,11 +179,6 @@ void dynamic_per_token_scaled_fp8_quant(
|
|||||||
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
|
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
|
||||||
c10::optional<torch::Tensor> const& scale_ub);
|
c10::optional<torch::Tensor> const& scale_ub);
|
||||||
|
|
||||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|
||||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
|
||||||
torch::Tensor experts_ids,
|
|
||||||
torch::Tensor num_tokens_post_pad);
|
|
||||||
|
|
||||||
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
|
||||||
const torch::Tensor& A, const torch::Tensor& B,
|
const torch::Tensor& A, const torch::Tensor& B,
|
||||||
const torch::Tensor& C,
|
const torch::Tensor& C,
|
||||||
@@ -177,20 +209,16 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
|||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
using fptr_t = int64_t;
|
using fptr_t = int64_t;
|
||||||
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
|
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
|
||||||
const std::vector<std::string>& handles,
|
torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
|
||||||
const std::vector<int64_t>& offsets, int64_t rank,
|
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
|
||||||
bool full_nvlink);
|
fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
|
||||||
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
|
|
||||||
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
|
|
||||||
torch::Tensor& out);
|
|
||||||
void dispose(fptr_t _fa);
|
void dispose(fptr_t _fa);
|
||||||
int64_t meta_size();
|
int64_t meta_size();
|
||||||
void register_buffer(fptr_t _fa, torch::Tensor& t,
|
void register_buffer(fptr_t _fa, const std::vector<int64_t>& fake_ipc_ptrs);
|
||||||
const std::vector<std::string>& handles,
|
std::tuple<std::vector<int64_t>, std::vector<int64_t>>
|
||||||
const std::vector<int64_t>& offsets);
|
get_graph_buffer_ipc_meta(fptr_t _fa);
|
||||||
std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
|
void register_graph_buffers(fptr_t _fa,
|
||||||
fptr_t _fa);
|
const std::vector<std::vector<int64_t>>& handles,
|
||||||
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
|
|
||||||
const std::vector<std::vector<int64_t>>& offsets);
|
const std::vector<std::vector<int64_t>>& offsets);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// each thread processes a block per query
|
||||||
__global__ void advance_step_flashinfer_kernel(
|
__global__ void advance_step_flashinfer_kernel(
|
||||||
int num_threads, int num_seqs, int num_queries, int block_size,
|
int num_threads, int num_seqs, int num_queries, int block_size,
|
||||||
long* input_tokens_ptr, long const* sampled_token_ids_ptr,
|
long* input_tokens_ptr, long const* sampled_token_ids_ptr,
|
||||||
@@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel(
|
|||||||
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
|
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
|
||||||
int* block_table_bound_ptr) {
|
int* block_table_bound_ptr) {
|
||||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
int idx = blockIdx.x * num_threads + threadIdx.x;
|
||||||
|
|
||||||
// Update paged_kv_indptr
|
// Update paged_kv_indptr
|
||||||
|
if (idx == 0) {
|
||||||
|
paged_kv_indptr_ptr[idx] = 0;
|
||||||
|
}
|
||||||
if (idx < num_queries) {
|
if (idx < num_queries) {
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
for (int i = 0; i <= idx; ++i) {
|
for (int i = 0; i <= idx; ++i) {
|
||||||
@@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel(
|
|||||||
}
|
}
|
||||||
|
|
||||||
__global__ void advance_step_flashinfer_indices_kernel(
|
__global__ void advance_step_flashinfer_indices_kernel(
|
||||||
int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
|
int num_seqs, int num_queries, int const* block_tables_ptr,
|
||||||
int64_t const block_tables_stride, int* paged_kv_indices_ptr,
|
int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr,
|
||||||
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
|
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
|
||||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
// note: max_num_blocks_per_seq = block_tables.stride(0)
|
||||||
int row = idx / block_tables_stride;
|
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
int col = idx % block_tables_stride;
|
|
||||||
|
|
||||||
if (row < num_queries && col < block_table_bound_ptr[row]) {
|
// when cuda graphs are enabled, paged_kv_indptr tensor
|
||||||
paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
|
// has to be updated for the padded queries
|
||||||
block_tables_ptr[row * block_tables_stride + col];
|
// tid represents a query# for paged_kv_indptr tensor
|
||||||
|
if (num_queries < tid && tid <= num_seqs) {
|
||||||
|
paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries];
|
||||||
}
|
}
|
||||||
// if cudagraph, fill padded seqs with the last valid seq's indptr
|
|
||||||
if (num_queries < row && row <= num_seqs) {
|
// each thread processes a block_ptr in block_tables
|
||||||
paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
|
// block_tables shape: [num_queries, max_num_blocks_per_seq]
|
||||||
|
// paged_kv_indices is flattened block_tables.
|
||||||
|
for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq);
|
||||||
|
idx += (gridDim.x * blockDim.x)) {
|
||||||
|
// block_tables-row = paged_kv_indptr[queryNum]
|
||||||
|
int queryNum = idx / max_num_blocks_per_seq;
|
||||||
|
int col = idx % max_num_blocks_per_seq;
|
||||||
|
if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) {
|
||||||
|
int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col;
|
||||||
|
int block_tables_idx = queryNum * max_num_blocks_per_seq + col;
|
||||||
|
paged_kv_indices_ptr[indices_arr_idx] =
|
||||||
|
block_tables_ptr[block_tables_idx];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,22 +263,16 @@ void advance_step_flashinfer(
|
|||||||
int threads;
|
int threads;
|
||||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||||
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
|
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
|
||||||
|
|
||||||
|
int block_tables_stride = block_tables.stride(0);
|
||||||
|
TORCH_CHECK((blocks * threads > num_queries),
|
||||||
|
"multi-step: not enough threads to map to num_queries = ",
|
||||||
|
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
|
||||||
|
" blocks = ", blocks, " max_threads = ", threads);
|
||||||
if (logging) {
|
if (logging) {
|
||||||
printf("launching kernel with %d blocks\n", blocks);
|
printf("launching kernels with %d blocks and %d threads\n", blocks,
|
||||||
|
threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(will): support arbitrary block_tables stride
|
|
||||||
if ((blocks * threads) / block_tables.stride(0) < num_queries) {
|
|
||||||
TORCH_CHECK(false,
|
|
||||||
"multi-step: not enough threads to map block_table to"
|
|
||||||
"FlashInfer's paged_kv_indices on GPU. Try reducing the number "
|
|
||||||
"of seqs,",
|
|
||||||
" increasing the block size or take smaller steps.",
|
|
||||||
" num_queries = ", num_queries,
|
|
||||||
" block_tables.stride(0) = ", block_tables.stride(0),
|
|
||||||
" blocks = ", blocks, " max_threads = ", threads);
|
|
||||||
}
|
|
||||||
|
|
||||||
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
|
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
|
||||||
threads, num_seqs, num_queries, block_size,
|
threads, num_seqs, num_queries, block_size,
|
||||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||||
@@ -281,7 +291,7 @@ void advance_step_flashinfer(
|
|||||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||||
|
|
||||||
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
|
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
|
||||||
threads, num_seqs, num_queries,
|
num_seqs, num_queries,
|
||||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||||
block_tables.stride(0),
|
block_tables.stride(0),
|
||||||
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
|
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user