Compare commits
465 Commits
v0.17.0rc0
...
v0.17.2rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
54a62a79f7 | ||
|
|
384dc7f77b | ||
|
|
f04d5226f8 | ||
|
|
0a0a1a198b | ||
|
|
6c1cfbad32 | ||
|
|
45f526d652 | ||
|
|
5db91f0aaf | ||
|
|
061980c36a | ||
|
|
7a49742b88 | ||
|
|
3e6a1e1686 | ||
|
|
7961486a9b | ||
|
|
4f9b14c21c | ||
|
|
31a458c091 | ||
|
|
a3a51d20e7 | ||
|
|
e5b807607c | ||
|
|
fd4d96302a | ||
|
|
c0f011918d | ||
|
|
e6ae4b1be1 | ||
|
|
2dccb38f73 | ||
|
|
d157216093 | ||
|
|
93f3c8e531 | ||
|
|
2cc26c3a99 | ||
|
|
dfa8852db2 | ||
|
|
714c6e0eab | ||
|
|
0fefd00e6c | ||
|
|
f5c081d432 | ||
|
|
c88ea8338b | ||
|
|
9f9ecff4cd | ||
|
|
ca1954d58c | ||
|
|
55e6d3d5c0 | ||
|
|
6682c231fa | ||
|
|
5ae685c1c8 | ||
|
|
ce8cf9161d | ||
|
|
18be11fd59 | ||
|
|
8d8855fdae | ||
|
|
e855d380fa | ||
|
|
0e5a9382af | ||
|
|
04bf5a35fa | ||
|
|
43a73f853b | ||
|
|
ffbc2e5bdb | ||
|
|
f9e6db3034 | ||
|
|
d61d2b08e9 | ||
|
|
f5e59ee7a6 | ||
|
|
9b005edc48 | ||
|
|
bf9a185395 | ||
|
|
ad041c79db | ||
|
|
747b068136 | ||
|
|
122f75d939 | ||
|
|
d8f8a7aad2 | ||
|
|
0115e957d4 | ||
|
|
116ed130f4 | ||
|
|
8374387bd8 | ||
|
|
912fbe9555 | ||
|
|
52131f88d9 | ||
|
|
821eb80c0d | ||
|
|
a2956a0f8e | ||
|
|
911355e216 | ||
|
|
8d3f8f485e | ||
|
|
96efb91480 | ||
|
|
2754231ba3 | ||
|
|
2390d44209 | ||
|
|
7362b4450a | ||
|
|
57a314d155 | ||
|
|
d4c57863f7 | ||
|
|
68e1b711f1 | ||
|
|
0024f39a32 | ||
|
|
e9163b536e | ||
|
|
7acaea634c | ||
|
|
697e4ff352 | ||
|
|
a3e2e250f0 | ||
|
|
143e4dccdf | ||
|
|
6590a3ecda | ||
|
|
b3debb7e77 | ||
|
|
458c1a4b2d | ||
|
|
821fde2df4 | ||
|
|
8c29042bb9 | ||
|
|
5467d137b3 | ||
|
|
3ed46f374b | ||
|
|
84868e4793 | ||
|
|
a8e8d62dd8 | ||
|
|
e42b49bd69 | ||
|
|
4a718e770d | ||
|
|
600a039f57 | ||
|
|
ffa5d74f15 | ||
|
|
74fe80ee95 | ||
|
|
bcfdadb1bc | ||
|
|
236de72e49 | ||
|
|
a116f96930 | ||
|
|
092ace9e3a | ||
|
|
f680dc1b39 | ||
|
|
b41aa264f9 | ||
|
|
367cf5cd3e | ||
|
|
6d53efd2a5 | ||
|
|
8b346309a5 | ||
|
|
54a6db827f | ||
|
|
9efc4db965 | ||
|
|
f1816fb192 | ||
|
|
0005d2a3c9 | ||
|
|
d0b402974f | ||
|
|
6341d43043 | ||
|
|
7afe0faab1 | ||
|
|
5a3f1eb62f | ||
|
|
b3ce711b93 | ||
|
|
abf61aaa8e | ||
|
|
4508532fbd | ||
|
|
d5af196c18 | ||
|
|
82f836d976 | ||
|
|
4fccd30f19 | ||
|
|
cfaf4668f7 | ||
|
|
99a57bdf74 | ||
|
|
a2268617cf | ||
|
|
a4ad9db541 | ||
|
|
b373b5102a | ||
|
|
f296a1966d | ||
|
|
bc2c0c86ef | ||
|
|
891c60dcd5 | ||
|
|
1ce13cf992 | ||
|
|
10f08dedfa | ||
|
|
5e1a373d2e | ||
|
|
572c776bfb | ||
|
|
55d8073d06 | ||
|
|
cd32d6f586 | ||
|
|
aaa3092f51 | ||
|
|
87985077a4 | ||
|
|
a79c1c2c80 | ||
|
|
cc8f1f4764 | ||
|
|
05b9e8ab5b | ||
|
|
2cdf92228c | ||
|
|
c973ecdead | ||
|
|
e39257a552 | ||
|
|
cc16b24b17 | ||
|
|
bdc2343454 | ||
|
|
f444c05c32 | ||
|
|
85199f9681 | ||
|
|
a1257fd1ea | ||
|
|
abcffbba8c | ||
|
|
53ec16a705 | ||
|
|
2e693f48e7 | ||
|
|
7f1f36bf91 | ||
|
|
5282c7d4d0 | ||
|
|
9e19f8338b | ||
|
|
06e0bc21d2 | ||
|
|
5a71cdd76e | ||
|
|
f0d3658c0f | ||
|
|
57431d8231 | ||
|
|
3e64fe4a18 | ||
|
|
8cb24d3aed | ||
|
|
00726c74c9 | ||
|
|
9fe404ed04 | ||
|
|
802f306cd1 | ||
|
|
894843eb25 | ||
|
|
584a3f56de | ||
|
|
36735fd772 | ||
|
|
6ecabe4936 | ||
|
|
2f8b4ce0c0 | ||
|
|
2ef69456f5 | ||
|
|
17852aa503 | ||
|
|
8647c6cf51 | ||
|
|
513949f95f | ||
|
|
262b76a09f | ||
|
|
c34ba6b961 | ||
|
|
24062b704f | ||
|
|
d6b61e5166 | ||
|
|
cf632499ee | ||
|
|
a3774a8198 | ||
|
|
0ce21c46a0 | ||
|
|
55eed6b7a5 | ||
|
|
c77181e534 | ||
|
|
12001f2ebc | ||
|
|
7ee5d5093b | ||
|
|
428bc718bd | ||
|
|
ff1e3d9c63 | ||
|
|
35bdca5431 | ||
|
|
8a24842765 | ||
|
|
65986db6ba | ||
|
|
9556af87d5 | ||
|
|
a1a3523a56 | ||
|
|
741f4e046b | ||
|
|
a5d06dc557 | ||
|
|
5efa206a8c | ||
|
|
196802dfa6 | ||
|
|
c84b519cf3 | ||
|
|
741ecf0630 | ||
|
|
b7e5a588d8 | ||
|
|
822e250ab7 | ||
|
|
bea02cdf93 | ||
|
|
a3ea760ea5 | ||
|
|
35db669f1d | ||
|
|
afebeffbfb | ||
|
|
5573894737 | ||
|
|
d5816c8c2f | ||
|
|
8ccbcda5c0 | ||
|
|
a9e532afe2 | ||
|
|
f3163bba67 | ||
|
|
700a1ddc65 | ||
|
|
f33251ffc8 | ||
|
|
e584dce52b | ||
|
|
40c0461f24 | ||
|
|
724759684c | ||
|
|
9c34e9d24f | ||
|
|
09b6f99852 | ||
|
|
c87fb515ed | ||
|
|
5353c9b016 | ||
|
|
13e79fc811 | ||
|
|
9d07a3d6e4 | ||
|
|
646b85544b | ||
|
|
4286cc5ec2 | ||
|
|
545d18d81b | ||
|
|
e661b9ee83 | ||
|
|
c910eeb125 | ||
|
|
f4ae58b38b | ||
|
|
e568cf88bc | ||
|
|
098d844731 | ||
|
|
a40ee486f2 | ||
|
|
eac2dc2b41 | ||
|
|
d5080aeaa4 | ||
|
|
f22d6e0267 | ||
|
|
76c6e6da08 | ||
|
|
4184653775 | ||
|
|
4aaaf8c8ce | ||
|
|
4bf533623b | ||
|
|
5f77ef15ae | ||
|
|
7d6abdd022 | ||
|
|
a8ff2cca92 | ||
|
|
42fadebecb | ||
|
|
a197eda9c3 | ||
|
|
82b110d50e | ||
|
|
9040cd40af | ||
|
|
fa0d353acf | ||
|
|
b386bb3d7c | ||
|
|
fe714dd507 | ||
|
|
8ab3d7427c | ||
|
|
84e436ed1c | ||
|
|
81939e7733 | ||
|
|
195d1ca3e8 | ||
|
|
8d983d7cd6 | ||
|
|
65b2f405dc | ||
|
|
2a68464c5b | ||
|
|
bdd8981dab | ||
|
|
f088a831dd | ||
|
|
f83b933b84 | ||
|
|
82f3f30e26 | ||
|
|
9095cbbfb6 | ||
|
|
721ae79f50 | ||
|
|
aefc59f088 | ||
|
|
d88f28da05 | ||
|
|
106ff69c4e | ||
|
|
ca5fb4bbd8 | ||
|
|
cf88b23749 | ||
|
|
a3189a08b0 | ||
|
|
409c4e632d | ||
|
|
8850738b70 | ||
|
|
234860399b | ||
|
|
c88510083b | ||
|
|
4ff8c3c8f9 | ||
|
|
507ddbe992 | ||
|
|
ddbb0d230a | ||
|
|
9efc3bdcd6 | ||
|
|
156e33553c | ||
|
|
d0cd736caa | ||
|
|
195c997203 | ||
|
|
04b67d8f62 | ||
|
|
7279374f91 | ||
|
|
006aea17d7 | ||
|
|
0836be3b03 | ||
|
|
4e95ec111c | ||
|
|
179547d62c | ||
|
|
f85b4eda3a | ||
|
|
2a194ddd72 | ||
|
|
203a7f27da | ||
|
|
483463f735 | ||
|
|
4e571ce643 | ||
|
|
4ff9b045fe | ||
|
|
3fd03f1ec2 | ||
|
|
10a5f4d53d | ||
|
|
fe0c085c28 | ||
|
|
8d6b3d5dda | ||
|
|
4b87ffbefb | ||
|
|
fa028207aa | ||
|
|
d460a18fc6 | ||
|
|
6e956d9eca | ||
|
|
1e0f917b34 | ||
|
|
c174d54f86 | ||
|
|
55d27cca55 | ||
|
|
580864d81e | ||
|
|
2b28b9b269 | ||
|
|
70485a11bd | ||
|
|
74a9f54cdb | ||
|
|
00c4cb5606 | ||
|
|
941e52c298 | ||
|
|
be292b7c14 | ||
|
|
77a73458e3 | ||
|
|
5578f2a4d3 | ||
|
|
3ec2115015 | ||
|
|
b0906d8b02 | ||
|
|
aaf5fa9abf | ||
|
|
f96c3ab08c | ||
|
|
dc6b578466 | ||
|
|
1bc9c77f6d | ||
|
|
65a4da1504 | ||
|
|
217f27598d | ||
|
|
fff3711a24 | ||
|
|
c4d859c274 | ||
|
|
747431044d | ||
|
|
d62856b928 | ||
|
|
bd2659a566 | ||
|
|
90512b2e8b | ||
|
|
dcf8862fd4 | ||
|
|
43aa389231 | ||
|
|
384425f84e | ||
|
|
a0f44bb616 | ||
|
|
fde4771bbd | ||
|
|
e5ff140216 | ||
|
|
0a6a3a1290 | ||
|
|
4497431df6 | ||
|
|
b7332b058c | ||
|
|
40077ea3de | ||
|
|
5d6aae4577 | ||
|
|
63298ee173 | ||
|
|
2dde535df1 | ||
|
|
379689d533 | ||
|
|
a6be75dbd2 | ||
|
|
ee54f9cdb9 | ||
|
|
fc4657756f | ||
|
|
eebd14651f | ||
|
|
ebb9cc5f2b | ||
|
|
85f50eb41f | ||
|
|
5261223c2d | ||
|
|
00b814ba5a | ||
|
|
ee8a29511f | ||
|
|
755356b3d1 | ||
|
|
58928475e4 | ||
|
|
1a9718085c | ||
|
|
7eb524e64c | ||
|
|
c7f32e08c2 | ||
|
|
b354686524 | ||
|
|
6a18d8789b | ||
|
|
24a03915f5 | ||
|
|
b5e34e1fca | ||
|
|
ce8546a12b | ||
|
|
c188749bcd | ||
|
|
225d1090a0 | ||
|
|
f3c6c9c9d7 | ||
|
|
26bd43b52d | ||
|
|
6b625a8807 | ||
|
|
54756b6109 | ||
|
|
39f9ea0da4 | ||
|
|
e4ae148a78 | ||
|
|
1d0c0d209c | ||
|
|
fcb73f306c | ||
|
|
e2090bf3af | ||
|
|
2a00d3241f | ||
|
|
10f4db4dbe | ||
|
|
5b3ba94ab4 | ||
|
|
90f3c01fa4 | ||
|
|
807d680337 | ||
|
|
5afb387bd4 | ||
|
|
43e77e59ab | ||
|
|
00bd08edee | ||
|
|
43f10573c9 | ||
|
|
86e1060b17 | ||
|
|
27066d1b2b | ||
|
|
57c84ff129 | ||
|
|
e68de8adc0 | ||
|
|
a1ffa56a1e | ||
|
|
0a208d1f54 | ||
|
|
03a49bb8f0 | ||
|
|
8e87cc57f1 | ||
|
|
6dd302653f | ||
|
|
de00ebeac4 | ||
|
|
639680d220 | ||
|
|
c5362c739f | ||
|
|
0a49676fb0 | ||
|
|
c012a8c477 | ||
|
|
ebed80a7c8 | ||
|
|
a73af584fe | ||
|
|
a97954b6a8 | ||
|
|
a911f4dd20 | ||
|
|
5395471d29 | ||
|
|
a57c877f18 | ||
|
|
f917020983 | ||
|
|
86483ca774 | ||
|
|
b93a9e6f6d | ||
|
|
d8839ef7d9 | ||
|
|
e998fa76b9 | ||
|
|
6a895197fa | ||
|
|
8c760b6ab6 | ||
|
|
3ee68590c7 | ||
|
|
7196348157 | ||
|
|
176c799f4c | ||
|
|
612e7729c2 | ||
|
|
ecde7af9c4 | ||
|
|
8df523351f | ||
|
|
b03ff6a96b | ||
|
|
ed81d5edd1 | ||
|
|
3c23ac840e | ||
|
|
a708ef5944 | ||
|
|
66a2209645 | ||
|
|
0bfa229bf1 | ||
|
|
7493c51c55 | ||
|
|
ac773bbe80 | ||
|
|
48e376a007 | ||
|
|
21eb2c3372 | ||
|
|
e2b31243c0 | ||
|
|
c3598d02fa | ||
|
|
57c629e9c1 | ||
|
|
d106bf39f5 | ||
|
|
b0651021e5 | ||
|
|
f600d5192e | ||
|
|
8e7820131e | ||
|
|
0a12cea25f | ||
|
|
dd6dbd93f8 | ||
|
|
26366009c5 | ||
|
|
16c472abe7 | ||
|
|
3b23d57c96 | ||
|
|
2f4226fe52 | ||
|
|
792cbd64ca | ||
|
|
2ed4722e26 | ||
|
|
a3299c3d1d | ||
|
|
6c21a0c2d7 | ||
|
|
562339abc3 | ||
|
|
d7adcadb9b | ||
|
|
f678c3f61a | ||
|
|
be0a3f7570 | ||
|
|
17dc9c7fc9 | ||
|
|
7eca859110 | ||
|
|
636ee223ac | ||
|
|
b7d59ffce2 | ||
|
|
5569f5218d | ||
|
|
138d891d7f | ||
|
|
d7166e74c1 | ||
|
|
417fd28fb1 | ||
|
|
7faba503c4 | ||
|
|
bc6be89d16 | ||
|
|
32224f568a | ||
|
|
f3dc292e9f | ||
|
|
138c5fa186 | ||
|
|
2f2c1d73a7 | ||
|
|
fb3e78ab09 | ||
|
|
fd3bfe74c9 | ||
|
|
bfdb512f11 | ||
|
|
d25c1ec3c9 | ||
|
|
7cc6058ac6 | ||
|
|
28028dff2f | ||
|
|
3417ba5648 | ||
|
|
58cfe0dc44 | ||
|
|
e86221deb6 | ||
|
|
289fc48ab7 | ||
|
|
2f2212e6cc | ||
|
|
18e01a0a10 | ||
|
|
6cb901093f | ||
|
|
ead7bde1ab | ||
|
|
6aa6ad8992 | ||
|
|
c8c3935b70 | ||
|
|
bb6888b8b1 | ||
|
|
1aaec59d79 | ||
|
|
1659b2e058 | ||
|
|
d6e04f4c43 | ||
|
|
a8f66cbde8 | ||
|
|
16d2ad1d38 | ||
|
|
5dc3538736 | ||
|
|
36bf213181 | ||
|
|
6f0dd93801 | ||
|
|
5d199ac8f2 | ||
|
|
9e0f44bec4 |
@@ -10,7 +10,7 @@ steps:
|
|||||||
docker build
|
docker build
|
||||||
--build-arg max_jobs=16
|
--build-arg max_jobs=16
|
||||||
--build-arg REMOTE_VLLM=1
|
--build-arg REMOTE_VLLM=1
|
||||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
|
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
|
||||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||||
-f docker/Dockerfile.rocm
|
-f docker/Dockerfile.rocm
|
||||||
|
|||||||
@@ -21,6 +21,20 @@ steps:
|
|||||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||||
|
|
||||||
|
- label: CPU-Compatibility Tests
|
||||||
|
depends_on: []
|
||||||
|
soft_fail: true
|
||||||
|
device: intel_cpu
|
||||||
|
no_plugin: true
|
||||||
|
source_file_dependencies:
|
||||||
|
- cmake/cpu_extension.cmake
|
||||||
|
- setup.py
|
||||||
|
- vllm/platforms/cpu.py
|
||||||
|
commands:
|
||||||
|
- |
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||||
|
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
|
||||||
|
|
||||||
- label: CPU-Language Generation and Pooling Model Tests
|
- label: CPU-Language Generation and Pooling Model Tests
|
||||||
depends_on: []
|
depends_on: []
|
||||||
soft_fail: true
|
soft_fail: true
|
||||||
|
|||||||
@@ -25,9 +25,7 @@ fi
|
|||||||
docker build --file docker/Dockerfile.cpu \
|
docker build --file docker/Dockerfile.cpu \
|
||||||
--build-arg max_jobs=16 \
|
--build-arg max_jobs=16 \
|
||||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
--build-arg VLLM_CPU_X86=true \
|
||||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
|
||||||
--build-arg VLLM_CPU_AMXBF16=true \
|
|
||||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||||
--target vllm-test \
|
--target vllm-test \
|
||||||
--progress plain .
|
--progress plain .
|
||||||
|
|||||||
@@ -13,9 +13,10 @@ import os
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy as np
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
DEFAULT_RTOL = 0.08
|
DEFAULT_RTOL = 0.08
|
||||||
|
|
||||||
|
|
||||||
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
|
|||||||
"allow_deprecated_quantization=True,"
|
"allow_deprecated_quantization=True,"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||||
|
model_args += "attention_backend=TRITON_ATTN"
|
||||||
|
|
||||||
env_vars = eval_config.get("env_vars", None)
|
env_vars = eval_config.get("env_vars", None)
|
||||||
with scoped_env_vars(env_vars):
|
with scoped_env_vars(env_vars):
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
|||||||
f"ground_truth={ground_truth:.3f} | "
|
f"ground_truth={ground_truth:.3f} | "
|
||||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||||
)
|
)
|
||||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
|
||||||
|
min_acceptable = ground_truth * (1 - rtol)
|
||||||
|
success = success and measured_value >= min_acceptable
|
||||||
|
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -7,12 +7,12 @@ import argparse
|
|||||||
import html as _html
|
import html as _html
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from contextlib import nullcontext
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from importlib import util
|
from importlib import util
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import regex as re
|
|
||||||
|
|
||||||
pd.options.display.float_format = "{:.2f}".format
|
pd.options.display.float_format = "{:.2f}".format
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
|
|||||||
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
|
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------
|
||||||
|
# Concurrency normalization (NEW, small)
|
||||||
|
# -----------------------------
|
||||||
|
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||||
|
for c in [
|
||||||
|
"# of max concurrency.",
|
||||||
|
"# of max concurrency",
|
||||||
|
"Max Concurrency",
|
||||||
|
"max_concurrency",
|
||||||
|
"Concurrency",
|
||||||
|
]:
|
||||||
|
if c in df.columns:
|
||||||
|
return c
|
||||||
|
|
||||||
|
for c in df.columns:
|
||||||
|
if "concurr" in str(c).lower():
|
||||||
|
s = df[c]
|
||||||
|
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
|
||||||
|
return c
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot infer concurrency column. "
|
||||||
|
"Please rename the column to one of the known names "
|
||||||
|
"or add an explicit override (e.g., --concurrency-col)."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_concurrency_in_df(
|
||||||
|
df: pd.DataFrame, canonical: str = "# of max concurrency."
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
if canonical in df.columns:
|
||||||
|
return df
|
||||||
|
detected = _find_concurrency_col(df)
|
||||||
|
if detected in df.columns and detected != canonical:
|
||||||
|
return df.rename(columns={detected: canonical})
|
||||||
|
df[canonical] = pd.NA
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Core data compare
|
# Core data compare
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -52,19 +91,25 @@ def compare_data_columns(
|
|||||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
- Concat along axis=1 (indexes align), then reset_index so callers can
|
||||||
group by columns.
|
group by columns.
|
||||||
- If --debug, add a <file_label>_name column per file.
|
- If --debug, add a <file_label>_name column per file.
|
||||||
|
|
||||||
|
Minimal fix to support different max_concurrency lists across files:
|
||||||
|
- normalize concurrency column naming to "# of max concurrency."
|
||||||
|
- align on UNION of keys (missing points become NaN)
|
||||||
|
- BUGFIX: don't drop throughput rows based on P99/Median presence
|
||||||
"""
|
"""
|
||||||
print("\ncompare_data_column:", data_column)
|
print("\ncompare_data_column:", data_column)
|
||||||
|
|
||||||
frames = []
|
frames = []
|
||||||
raw_data_cols: list[str] = []
|
raw_data_cols: list[str] = []
|
||||||
compare_frames = []
|
|
||||||
|
|
||||||
|
# Determine key cols after normalizing concurrency
|
||||||
cols_per_file: list[set] = []
|
cols_per_file: list[set] = []
|
||||||
for f in files:
|
for f in files:
|
||||||
try:
|
try:
|
||||||
df_tmp = pd.read_json(f, orient="records")
|
df_tmp = pd.read_json(f, orient="records")
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ValueError(f"Failed to read {f}") from err
|
raise ValueError(f"Failed to read {f}") from err
|
||||||
|
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
|
||||||
cols_per_file.append(set(df_tmp.columns))
|
cols_per_file.append(set(df_tmp.columns))
|
||||||
|
|
||||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
||||||
@@ -75,12 +120,25 @@ def compare_data_columns(
|
|||||||
"No common key columns found from info_cols across the input files."
|
"No common key columns found from info_cols across the input files."
|
||||||
)
|
)
|
||||||
|
|
||||||
meta_added = False
|
union_index = None
|
||||||
|
metas: list[pd.DataFrame] = []
|
||||||
|
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
df = pd.read_json(file, orient="records")
|
df = pd.read_json(file, orient="records")
|
||||||
|
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
|
||||||
|
|
||||||
if drop_column in df.columns:
|
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
|
||||||
|
# NaN in P99/Median columns even if the column exists in the JSON.
|
||||||
|
metric_lc = str(data_column).lower()
|
||||||
|
is_latency_metric = (
|
||||||
|
"ttft" in metric_lc
|
||||||
|
or "tpot" in metric_lc
|
||||||
|
or "p99" in metric_lc
|
||||||
|
or "median" in metric_lc
|
||||||
|
or metric_lc.strip() in {"p99", "median"}
|
||||||
|
)
|
||||||
|
if is_latency_metric and drop_column in df.columns:
|
||||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
df = df.dropna(subset=[drop_column], ignore_index=True)
|
||||||
|
|
||||||
for c in (
|
for c in (
|
||||||
@@ -105,35 +163,61 @@ def compare_data_columns(
|
|||||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
meta = meta.groupby(level=key_cols, dropna=False).first()
|
||||||
|
|
||||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
||||||
s = df_idx[data_column]
|
|
||||||
if not s.index.is_unique:
|
if data_column in df_idx.columns:
|
||||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
s = df_idx[data_column]
|
||||||
|
if not s.index.is_unique:
|
||||||
|
s = s.groupby(level=key_cols, dropna=False).mean()
|
||||||
|
else:
|
||||||
|
# keep NA series to preserve meta keys for union_index
|
||||||
|
s = pd.Series(pd.NA, index=meta.index)
|
||||||
s.name = file_label
|
s.name = file_label
|
||||||
|
|
||||||
if not meta_added:
|
name_s = None
|
||||||
frames.append(meta)
|
|
||||||
meta_added = True
|
|
||||||
|
|
||||||
if debug and name_column in df_idx.columns:
|
if debug and name_column in df_idx.columns:
|
||||||
name_s = df_idx[name_column]
|
name_s = df_idx[name_column]
|
||||||
if not name_s.index.is_unique:
|
if not name_s.index.is_unique:
|
||||||
name_s = name_s.groupby(level=key_cols, dropna=False).first()
|
name_s = name_s.groupby(level=key_cols, dropna=False).first()
|
||||||
name_s.name = f"{file_label}_name"
|
name_s.name = f"{file_label}_name"
|
||||||
frames.append(name_s)
|
|
||||||
|
|
||||||
frames.append(s)
|
if union_index is None:
|
||||||
|
union_index = meta.index
|
||||||
|
else:
|
||||||
|
union_index = union_index.union(meta.index)
|
||||||
|
metas.append(meta)
|
||||||
|
|
||||||
|
staged.append((file_label, s, name_s))
|
||||||
|
|
||||||
|
if union_index is None:
|
||||||
|
raise ValueError("No data found after loading inputs.")
|
||||||
|
|
||||||
|
# meta first (union-aligned): build UNION meta across all files
|
||||||
|
if metas:
|
||||||
|
meta_union = pd.concat(metas, axis=0)
|
||||||
|
# Collapse duplicates on the MultiIndex; keep first non-null per column
|
||||||
|
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
|
||||||
|
frames.append(meta_union.reindex(union_index))
|
||||||
|
|
||||||
|
# values + ratios (union-aligned)
|
||||||
|
metric_series_aligned: list[pd.Series] = []
|
||||||
|
for file_label, s, name_s in staged:
|
||||||
|
s_aligned = s.reindex(union_index)
|
||||||
|
frames.append(s_aligned)
|
||||||
raw_data_cols.append(file_label)
|
raw_data_cols.append(file_label)
|
||||||
compare_frames.append(s)
|
metric_series_aligned.append(s_aligned)
|
||||||
|
|
||||||
if len(compare_frames) >= 2:
|
if debug and name_s is not None:
|
||||||
base = compare_frames[0]
|
frames.append(name_s.reindex(union_index))
|
||||||
current = compare_frames[-1]
|
|
||||||
if "P99" in data_column or "Median" in data_column:
|
if len(metric_series_aligned) >= 2:
|
||||||
|
base = metric_series_aligned[0]
|
||||||
|
current = metric_series_aligned[-1]
|
||||||
|
if "P99" in str(data_column) or "Median" in str(data_column):
|
||||||
ratio = base / current
|
ratio = base / current
|
||||||
else:
|
else:
|
||||||
ratio = current / base
|
ratio = current / base
|
||||||
ratio = ratio.mask(base == 0)
|
ratio = ratio.mask(base == 0)
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
|
||||||
frames.append(ratio)
|
frames.append(ratio)
|
||||||
|
|
||||||
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
|
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
|
||||||
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
|
|||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Styling helpers
|
# Styling helpers
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
|
||||||
for c in [
|
|
||||||
"# of max concurrency.",
|
|
||||||
"# of max concurrency",
|
|
||||||
"Max Concurrency",
|
|
||||||
"max_concurrency",
|
|
||||||
"Concurrency",
|
|
||||||
]:
|
|
||||||
if c in df.columns:
|
|
||||||
return c
|
|
||||||
for c in df.columns:
|
|
||||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
|
||||||
return c
|
|
||||||
return "# of max concurrency."
|
|
||||||
|
|
||||||
|
|
||||||
def _highlight_threshold(
|
def _highlight_threshold(
|
||||||
df: pd.DataFrame, threshold: float
|
df: pd.DataFrame,
|
||||||
|
threshold: float,
|
||||||
|
slack_pct: float = 0.0,
|
||||||
) -> pd.io.formats.style.Styler:
|
) -> pd.io.formats.style.Styler:
|
||||||
conc_col = _find_concurrency_col(df)
|
conc_col = _find_concurrency_col(df)
|
||||||
key_cols = [
|
key_cols = [
|
||||||
@@ -234,12 +304,24 @@ def _highlight_threshold(
|
|||||||
]
|
]
|
||||||
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||||
|
|
||||||
return df.style.map(
|
try:
|
||||||
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
slack_pct = float(slack_pct or 0.0)
|
||||||
if pd.notna(v) and v <= threshold
|
except Exception:
|
||||||
else "",
|
slack_pct = 0.0
|
||||||
subset=conf_cols,
|
slack_limit = threshold * (1.0 + slack_pct / 100.0)
|
||||||
)
|
|
||||||
|
def _cell(v):
|
||||||
|
if pd.isna(v):
|
||||||
|
return ""
|
||||||
|
if v <= threshold:
|
||||||
|
# Strict SLA
|
||||||
|
return "background-color:#e6ffe6;font-weight:bold;"
|
||||||
|
if v <= slack_limit:
|
||||||
|
# Within slack range
|
||||||
|
return "background-color:#ffe5cc;font-weight:bold;"
|
||||||
|
return ""
|
||||||
|
|
||||||
|
return df.style.map(_cell, subset=conf_cols)
|
||||||
|
|
||||||
|
|
||||||
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
||||||
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
|
|||||||
- max 31 chars
|
- max 31 chars
|
||||||
- cannot contain: : \ / ? * [ ]
|
- cannot contain: : \ / ? * [ ]
|
||||||
- cannot be empty
|
- cannot be empty
|
||||||
|
|
||||||
|
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
|
||||||
|
module's compile overhead/edge-cases on some systems.
|
||||||
"""
|
"""
|
||||||
name = "sheet" if name is None else str(name)
|
name = "sheet" if name is None else str(name)
|
||||||
name = re.sub(r"[:\\/?*\[\]]", "_", name)
|
|
||||||
|
# Replace illegal characters with underscore.
|
||||||
|
trans = str.maketrans(
|
||||||
|
{
|
||||||
|
":": "_",
|
||||||
|
"\\": "_",
|
||||||
|
"/": "_",
|
||||||
|
"?": "_",
|
||||||
|
"*": "_",
|
||||||
|
"[": "_",
|
||||||
|
"]": "_",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
name = name.translate(trans)
|
||||||
|
|
||||||
|
# Strip quotes/spaces and collapse whitespace.
|
||||||
name = name.strip().strip("'")
|
name = name.strip().strip("'")
|
||||||
name = re.sub(r"\s+", " ", name)
|
name = " ".join(name.split())
|
||||||
|
|
||||||
if not name:
|
if not name:
|
||||||
name = "sheet"
|
name = "sheet"
|
||||||
return name[:31]
|
return name[:31]
|
||||||
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:
|
|||||||
|
|
||||||
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
||||||
d = dict(zip(group_cols, gkey_tuple))
|
d = dict(zip(group_cols, gkey_tuple))
|
||||||
model = d.get("Model", "model")
|
|
||||||
model_short = str(model).split("/")[-1]
|
# Always keep input/output lengths (these are important).
|
||||||
ilen = d.get("Input Len", "")
|
ilen = d.get("Input Len", "")
|
||||||
olen = d.get("Output Len", "")
|
olen = d.get("Output Len", "")
|
||||||
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
||||||
|
|
||||||
|
# Shorten model name aggressively to make room for lens.
|
||||||
|
model = d.get("Model", "model")
|
||||||
|
leaf = str(model).split("/")[-1]
|
||||||
|
|
||||||
|
max_model_len = max(1, 31 - len(lens))
|
||||||
|
model_short = leaf[:max_model_len]
|
||||||
|
|
||||||
return _sanitize_sheet_name(f"{model_short}{lens}")
|
return _sanitize_sheet_name(f"{model_short}{lens}")
|
||||||
|
|
||||||
|
|
||||||
def _write_tables_to_excel_sheet(
|
def _write_tables_to_excel_sheet(
|
||||||
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
||||||
):
|
):
|
||||||
startrow = 0
|
"""Write all blocks to a sheet with a single to_excel() call.
|
||||||
|
|
||||||
|
Pandas+openpyxl can be extremely slow when called many times per sheet.
|
||||||
|
We flatten blocks into one table with a 'Section' column to keep structure
|
||||||
|
while making Excel generation fast and deterministic.
|
||||||
|
"""
|
||||||
|
if not blocks:
|
||||||
|
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
|
||||||
|
return
|
||||||
|
|
||||||
|
combined_parts: list[pd.DataFrame] = []
|
||||||
for title, df in blocks:
|
for title, df in blocks:
|
||||||
pd.DataFrame([[title]]).to_excel(
|
df2 = df.copy()
|
||||||
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
|
# Put the section label as the first column for readability.
|
||||||
)
|
df2.insert(0, "Section", title)
|
||||||
startrow += 1
|
combined_parts.append(df2)
|
||||||
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
|
|
||||||
startrow += len(df) + 3
|
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
|
||||||
|
combined.to_excel(writer, sheet_name=sheet, index=False)
|
||||||
|
|
||||||
|
|
||||||
def _safe_filename(s: str) -> str:
|
def _safe_filename(s: str) -> str:
|
||||||
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
|
# Fast path without the third-party `regex` module.
|
||||||
return s[:180] if len(s) > 180 else s
|
s = " ".join(str(s).strip().split())
|
||||||
|
allowed = []
|
||||||
|
for ch in s:
|
||||||
|
if ch.isalnum() or ch in "._-":
|
||||||
|
allowed.append(ch)
|
||||||
|
else:
|
||||||
|
allowed.append("_")
|
||||||
|
out = "".join(allowed)
|
||||||
|
return out[:180] if len(out) > 180 else out
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
|
|||||||
|
|
||||||
|
|
||||||
def _max_concurrency_ok(
|
def _max_concurrency_ok(
|
||||||
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
|
df: pd.DataFrame,
|
||||||
|
conc_col: str,
|
||||||
|
cfg_col: str,
|
||||||
|
threshold: float,
|
||||||
|
slack_pct: float = 0.0,
|
||||||
):
|
):
|
||||||
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
|
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
|
||||||
return pd.NA
|
return pd.NA
|
||||||
@@ -441,7 +573,14 @@ def _max_concurrency_ok(
|
|||||||
if d.empty:
|
if d.empty:
|
||||||
return pd.NA
|
return pd.NA
|
||||||
|
|
||||||
ok = d[d[cfg_col] <= threshold]
|
# Accept values up to (1 + slack_pct%) above the SLA.
|
||||||
|
try:
|
||||||
|
slack_pct = float(slack_pct or 0.0)
|
||||||
|
except Exception:
|
||||||
|
slack_pct = 0.0
|
||||||
|
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
|
||||||
|
|
||||||
|
ok = d[d[cfg_col] <= effective_limit]
|
||||||
if ok.empty:
|
if ok.empty:
|
||||||
return pd.NA
|
return pd.NA
|
||||||
|
|
||||||
@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
if not cfg_cols:
|
if not cfg_cols:
|
||||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||||
|
|
||||||
|
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
|
||||||
|
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
|
||||||
|
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
|
||||||
|
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
|
||||||
|
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
for cfg in cfg_cols:
|
for cfg in cfg_cols:
|
||||||
ttft_max = (
|
ttft_max = (
|
||||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
_max_concurrency_ok(
|
||||||
|
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
|
||||||
|
)
|
||||||
if ttft_group_df is not None
|
if ttft_group_df is not None
|
||||||
else pd.NA
|
else pd.NA
|
||||||
)
|
)
|
||||||
tpot_max = (
|
tpot_max = (
|
||||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
_max_concurrency_ok(
|
||||||
|
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
|
||||||
|
)
|
||||||
if tpot_group_df is not None
|
if tpot_group_df is not None
|
||||||
else pd.NA
|
else pd.NA
|
||||||
)
|
)
|
||||||
@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
|
|||||||
rows.append(
|
rows.append(
|
||||||
{
|
{
|
||||||
"Configuration": cfg,
|
"Configuration": cfg,
|
||||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
|
||||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
|
||||||
f"Max {conc_col} (Both)": both,
|
f"Max {conc_col} (Both)": both,
|
||||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||||
"TTFT @ Both (ms)": ttft_at_both,
|
"TTFT @ Both (ms)": ttft_at_both,
|
||||||
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
|
|||||||
if not cfg_cols:
|
if not cfg_cols:
|
||||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||||
|
|
||||||
|
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
|
||||||
|
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
|
||||||
|
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
|
||||||
|
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
for cfg in cfg_cols:
|
for cfg in cfg_cols:
|
||||||
ttft_max = (
|
ttft_max = (
|
||||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
_max_concurrency_ok(
|
||||||
|
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
|
||||||
|
)
|
||||||
if ttft_group_df is not None
|
if ttft_group_df is not None
|
||||||
else pd.NA
|
else pd.NA
|
||||||
)
|
)
|
||||||
tpot_max = (
|
tpot_max = (
|
||||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
_max_concurrency_ok(
|
||||||
|
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
|
||||||
|
)
|
||||||
if tpot_group_df is not None
|
if tpot_group_df is not None
|
||||||
else pd.NA
|
else pd.NA
|
||||||
)
|
)
|
||||||
@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
|
|||||||
rows.append(
|
rows.append(
|
||||||
{
|
{
|
||||||
"Configuration": cfg,
|
"Configuration": cfg,
|
||||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
|
||||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
|
||||||
f"Max {conc_col} (Both)": both,
|
f"Max {conc_col} (Both)": both,
|
||||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||||
"TTFT @ Both (ms)": ttft_at_both,
|
"TTFT @ Both (ms)": ttft_at_both,
|
||||||
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
help="Reference limit for TPOT plots (ms)",
|
help="Reference limit for TPOT plots (ms)",
|
||||||
)
|
)
|
||||||
|
|
||||||
# ---- NEW: export options ----
|
# ---- SLA tolerance (slack) options ----
|
||||||
|
parser.add_argument(
|
||||||
|
"--ttft-slack-pct",
|
||||||
|
type=float,
|
||||||
|
default=5.0,
|
||||||
|
help="Allowed percentage above TTFT SLA (default: 5).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tpot-slack-pct",
|
||||||
|
type=float,
|
||||||
|
default=5.0,
|
||||||
|
help="Allowed percentage above TPOT SLA (default: 5).",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---- export options ----
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--excel-out",
|
"--excel-out",
|
||||||
type=str,
|
type=str,
|
||||||
@@ -843,9 +1015,13 @@ def render_metric_table_html(
|
|||||||
|
|
||||||
metric_name = metric_label.lower()
|
metric_name = metric_label.lower()
|
||||||
if "ttft" in metric_name:
|
if "ttft" in metric_name:
|
||||||
styler = _highlight_threshold(display_group, args.ttft_max_ms)
|
styler = _highlight_threshold(
|
||||||
|
display_group, args.ttft_max_ms, args.ttft_slack_pct
|
||||||
|
)
|
||||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
||||||
styler = _highlight_threshold(display_group, args.tpot_max_ms)
|
styler = _highlight_threshold(
|
||||||
|
display_group, args.tpot_max_ms, args.tpot_slack_pct
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
styler = display_group.style
|
styler = display_group.style
|
||||||
|
|
||||||
@@ -962,22 +1138,46 @@ def write_report_group_first(
|
|||||||
csv_dir.mkdir(parents=True, exist_ok=True)
|
csv_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
excel_path = args.excel_out or "perf_comparison.xlsx"
|
excel_path = args.excel_out or "perf_comparison.xlsx"
|
||||||
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
|
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
|
||||||
|
|
||||||
|
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
|
||||||
|
excel_engine = (
|
||||||
|
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
|
||||||
|
)
|
||||||
|
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
|
||||||
|
excel_engine = "openpyxl"
|
||||||
|
|
||||||
|
excel_engine_kwargs = {}
|
||||||
|
if excel_engine == "xlsxwriter":
|
||||||
|
# Reduce memory pressure & usually faster writes.
|
||||||
|
excel_engine_kwargs = {"options": {"constant_memory": True}}
|
||||||
|
|
||||||
|
xw_ctx = (
|
||||||
|
nullcontext(None)
|
||||||
|
if disable_excel
|
||||||
|
else pd.ExcelWriter(
|
||||||
|
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
|
with xw_ctx as xw:
|
||||||
|
used_sheets: set[str] = set()
|
||||||
# ---- Environment sheet (first) ----
|
# ---- Environment sheet (first) ----
|
||||||
env_sheet = _sanitize_sheet_name("Environment")
|
env_sheet = _sanitize_sheet_name("Environment")
|
||||||
env_df = _load_env_df_for_inputs(args, files)
|
env_df = _load_env_df_for_inputs(args, files)
|
||||||
if env_df is None or env_df.empty:
|
if xw is not None:
|
||||||
pd.DataFrame(
|
if env_df is None or env_df.empty:
|
||||||
[
|
pd.DataFrame(
|
||||||
{
|
[
|
||||||
"Section": "Environment",
|
{
|
||||||
"Key": "vllm_env.txt",
|
"Section": "Environment",
|
||||||
"Value": "NOT FOUND (or empty)",
|
"Key": "vllm_env.txt",
|
||||||
}
|
"Value": "NOT FOUND (or empty)",
|
||||||
]
|
}
|
||||||
).to_excel(xw, sheet_name=env_sheet, index=False)
|
]
|
||||||
else:
|
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
else:
|
||||||
|
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||||
|
used_sheets.add(env_sheet)
|
||||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||||
main_fh.write('<meta charset="utf-8">\n')
|
main_fh.write('<meta charset="utf-8">\n')
|
||||||
for gkey in group_keys:
|
for gkey in group_keys:
|
||||||
@@ -993,12 +1193,19 @@ def write_report_group_first(
|
|||||||
|
|
||||||
main_fh.write(group_header)
|
main_fh.write(group_header)
|
||||||
|
|
||||||
|
do_excel = xw is not None
|
||||||
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
||||||
sheet_base = sheet
|
sheet_base = sheet
|
||||||
dedup_i = 1
|
if do_excel:
|
||||||
while sheet in xw.sheets:
|
dedup_i = 1
|
||||||
dedup_i += 1
|
while sheet in used_sheets:
|
||||||
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
|
dedup_i += 1
|
||||||
|
suffix = f"_{dedup_i}"
|
||||||
|
# Ensure uniqueness even when sheet names are truncated.
|
||||||
|
base = str(sheet_base)
|
||||||
|
keep = max(1, 31 - len(suffix))
|
||||||
|
sheet = _sanitize_sheet_name(base[:keep] + suffix)
|
||||||
|
used_sheets.add(sheet)
|
||||||
|
|
||||||
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
||||||
|
|
||||||
@@ -1059,7 +1266,7 @@ def write_report_group_first(
|
|||||||
)
|
)
|
||||||
|
|
||||||
excel_blocks.append(
|
excel_blocks.append(
|
||||||
(metric_label, display_group.reset_index(drop=True))
|
(metric_label, group_df.reset_index(drop=True))
|
||||||
)
|
)
|
||||||
if csv_dir:
|
if csv_dir:
|
||||||
fn = _safe_filename(
|
fn = _safe_filename(
|
||||||
@@ -1067,7 +1274,7 @@ def write_report_group_first(
|
|||||||
"/", "_"
|
"/", "_"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
|
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
summary_html = build_valid_max_concurrency_summary_html(
|
summary_html = build_valid_max_concurrency_summary_html(
|
||||||
tput_group_df=tput_group_df,
|
tput_group_df=tput_group_df,
|
||||||
@@ -1097,9 +1304,13 @@ def write_report_group_first(
|
|||||||
)
|
)
|
||||||
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||||
|
|
||||||
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
if do_excel:
|
||||||
|
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||||
|
|
||||||
print(f"Wrote Excel: {excel_path}")
|
if disable_excel:
|
||||||
|
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
|
||||||
|
else:
|
||||||
|
print(f"Wrote Excel: {excel_path}")
|
||||||
if csv_dir:
|
if csv_dir:
|
||||||
print(f"Wrote CSVs under: {csv_dir}")
|
print(f"Wrote CSVs under: {csv_dir}")
|
||||||
|
|
||||||
|
|||||||
365
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
365
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
|
|||||||
MODEL_FILTER="${MODEL_FILTER:-}"
|
MODEL_FILTER="${MODEL_FILTER:-}"
|
||||||
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
||||||
|
|
||||||
|
# Adaptive search controls
|
||||||
|
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
|
||||||
|
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
|
||||||
|
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
|
||||||
|
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
|
||||||
|
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
|
||||||
|
|
||||||
check_gpus() {
|
check_gpus() {
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
# check the number of GPUs and GPU type.
|
# check the number of GPUs and GPU type.
|
||||||
@@ -183,6 +190,304 @@ upload_to_buildkite() {
|
|||||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# -------------------------------
|
||||||
|
# Adaptive concurrency helpers
|
||||||
|
# -------------------------------
|
||||||
|
result_json_path_for_serving() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency=$3
|
||||||
|
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_metric_ms() {
|
||||||
|
local metric_name=$1
|
||||||
|
local json_file=$2
|
||||||
|
|
||||||
|
[[ -f "$json_file" ]] || return 0
|
||||||
|
|
||||||
|
if [[ "$metric_name" == "ttft" ]]; then
|
||||||
|
jq -r '
|
||||||
|
[
|
||||||
|
.ttft_ms.p99?,
|
||||||
|
.metrics.ttft_ms.p99?,
|
||||||
|
.ttft.p99?,
|
||||||
|
.metrics.ttft.p99?,
|
||||||
|
.p99_ttft_ms?,
|
||||||
|
.ttft_ms.mean?,
|
||||||
|
.metrics.ttft_ms.mean?,
|
||||||
|
.ttft.mean?,
|
||||||
|
.metrics.ttft.mean?,
|
||||||
|
.mean_ttft_ms?
|
||||||
|
] | map(select(. != null)) | .[0] // empty
|
||||||
|
' "$json_file"
|
||||||
|
else
|
||||||
|
jq -r '
|
||||||
|
[
|
||||||
|
.tpot_ms.p99?,
|
||||||
|
.metrics.tpot_ms.p99?,
|
||||||
|
.tpot.p99?,
|
||||||
|
.metrics.tpot.p99?,
|
||||||
|
.p99_tpot_ms?,
|
||||||
|
.itl_ms.p99?,
|
||||||
|
.metrics.itl_ms.p99?,
|
||||||
|
.inter_token_latency_ms.p99?,
|
||||||
|
.tpot_ms.mean?,
|
||||||
|
.metrics.tpot_ms.mean?,
|
||||||
|
.tpot.mean?,
|
||||||
|
.metrics.tpot.mean?,
|
||||||
|
.itl_ms.mean?,
|
||||||
|
.metrics.itl_ms.mean?,
|
||||||
|
.mean_tpot_ms?,
|
||||||
|
.mean_itl_ms?
|
||||||
|
] | map(select(. != null)) | .[0] // empty
|
||||||
|
' "$json_file"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
evaluate_sla_from_json() {
|
||||||
|
local json_file=$1
|
||||||
|
local ttft
|
||||||
|
local tpot
|
||||||
|
local pass
|
||||||
|
|
||||||
|
[[ -f "$json_file" ]] || return 2
|
||||||
|
|
||||||
|
ttft=$(extract_metric_ms ttft "$json_file")
|
||||||
|
tpot=$(extract_metric_ms tpot "$json_file")
|
||||||
|
|
||||||
|
[[ -n "$ttft" && -n "$tpot" ]] || return 2
|
||||||
|
|
||||||
|
pass=$(jq -n \
|
||||||
|
--argjson ttft "$ttft" \
|
||||||
|
--argjson tpot "$tpot" \
|
||||||
|
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||||
|
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||||
|
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
|
||||||
|
|
||||||
|
[[ "$pass" == "true" ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
write_adaptive_summary_json() {
|
||||||
|
local summary_file=$1
|
||||||
|
local test_name=$2
|
||||||
|
local qps=$3
|
||||||
|
local static_last_pass=$4
|
||||||
|
local static_first_fail=$5
|
||||||
|
local final_last_pass=$6
|
||||||
|
local final_first_fail=$7
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg test_name "$test_name" \
|
||||||
|
--arg qps "$qps" \
|
||||||
|
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||||
|
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||||
|
--arg static_last_pass "${static_last_pass:-}" \
|
||||||
|
--arg static_first_fail "${static_first_fail:-}" \
|
||||||
|
--arg final_last_pass "${final_last_pass:-}" \
|
||||||
|
--arg final_first_fail "${final_first_fail:-}" \
|
||||||
|
'{
|
||||||
|
test_name: $test_name,
|
||||||
|
qps: $qps,
|
||||||
|
sla_ttft_ms: $sla_ttft,
|
||||||
|
sla_tpot_ms: $sla_tpot,
|
||||||
|
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
|
||||||
|
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
|
||||||
|
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
|
||||||
|
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
|
||||||
|
}' > "$summary_file"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_single_serving_probe() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency=$3
|
||||||
|
local tp=$4
|
||||||
|
local compilation_config_mode=$5
|
||||||
|
local optimization_level=$6
|
||||||
|
local client_args_effective=$7
|
||||||
|
local client_remote_args=$8
|
||||||
|
local server_command=$9
|
||||||
|
|
||||||
|
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||||
|
local result_json
|
||||||
|
local num_prompts_arg=""
|
||||||
|
local client_command
|
||||||
|
|
||||||
|
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
|
||||||
|
|
||||||
|
if [[ -f "$result_json" ]]; then
|
||||||
|
evaluate_sla_from_json "$result_json"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||||
|
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||||
|
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||||
|
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||||
|
num_prompts_arg="--num-prompts $num_prompts"
|
||||||
|
fi
|
||||||
|
|
||||||
|
client_command="vllm bench serve \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--max-concurrency $max_concurrency \
|
||||||
|
$num_prompts_arg \
|
||||||
|
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
|
||||||
|
$client_args_effective $client_remote_args "
|
||||||
|
|
||||||
|
echo "Adaptive probe: $client_command"
|
||||||
|
|
||||||
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
bash -c "$client_command"
|
||||||
|
fi
|
||||||
|
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
adaptive_search: true
|
||||||
|
}')
|
||||||
|
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
evaluate_sla_from_json "$result_json"
|
||||||
|
}
|
||||||
|
|
||||||
|
adaptive_refine_from_static_results() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency_list_raw=$3
|
||||||
|
local tp=$4
|
||||||
|
local compilation_config_mode=$5
|
||||||
|
local optimization_level=$6
|
||||||
|
local client_args_effective=$7
|
||||||
|
local client_remote_args=$8
|
||||||
|
local server_command=$9
|
||||||
|
|
||||||
|
local sorted_points
|
||||||
|
local point
|
||||||
|
local rc
|
||||||
|
local static_last_pass=""
|
||||||
|
local static_first_fail=""
|
||||||
|
local largest_static=""
|
||||||
|
local step_hint=1
|
||||||
|
local previous_point=""
|
||||||
|
local low
|
||||||
|
local high
|
||||||
|
local mid
|
||||||
|
local probes=0
|
||||||
|
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
|
||||||
|
|
||||||
|
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
|
||||||
|
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
|
||||||
|
|
||||||
|
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
|
||||||
|
[[ -n "$sorted_points" ]] || return 0
|
||||||
|
|
||||||
|
while read -r point; do
|
||||||
|
[[ -z "$point" ]] && continue
|
||||||
|
largest_static="$point"
|
||||||
|
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
|
||||||
|
rc=$?
|
||||||
|
if (( rc == 0 )); then
|
||||||
|
static_last_pass="$point"
|
||||||
|
elif (( rc == 1 )); then
|
||||||
|
if [[ -n "$static_last_pass" ]]; then
|
||||||
|
static_first_fail="$point"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$previous_point" ]]; then
|
||||||
|
step_hint=$(( point - previous_point ))
|
||||||
|
if (( step_hint < 1 )); then step_hint=1; fi
|
||||||
|
fi
|
||||||
|
previous_point="$point"
|
||||||
|
done <<< "$sorted_points"
|
||||||
|
|
||||||
|
if [[ -z "$static_last_pass" ]]; then
|
||||||
|
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$static_first_fail" ]]; then
|
||||||
|
low=$static_last_pass
|
||||||
|
high=$static_first_fail
|
||||||
|
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||||
|
mid=$(( (low + high) / 2 ))
|
||||||
|
probes=$(( probes + 1 ))
|
||||||
|
run_single_serving_probe \
|
||||||
|
"$test_name" "$qps" "$mid" "$tp" \
|
||||||
|
"$compilation_config_mode" "$optimization_level" \
|
||||||
|
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||||
|
rc=$?
|
||||||
|
if (( rc == 0 )); then
|
||||||
|
low=$mid
|
||||||
|
elif (( rc == 1 )); then
|
||||||
|
high=$mid
|
||||||
|
else
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
low=$largest_static
|
||||||
|
high=""
|
||||||
|
while (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||||
|
point=$(( low + step_hint ))
|
||||||
|
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
|
||||||
|
point=$ADAPTIVE_MAX_CONCURRENCY
|
||||||
|
fi
|
||||||
|
(( point > low )) || break
|
||||||
|
probes=$(( probes + 1 ))
|
||||||
|
run_single_serving_probe \
|
||||||
|
"$test_name" "$qps" "$point" "$tp" \
|
||||||
|
"$compilation_config_mode" "$optimization_level" \
|
||||||
|
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||||
|
rc=$?
|
||||||
|
if (( rc == 0 )); then
|
||||||
|
low=$point
|
||||||
|
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
|
||||||
|
step_hint=$(( step_hint * 2 ))
|
||||||
|
if (( step_hint < 1 )); then step_hint=1; fi
|
||||||
|
elif (( rc == 1 )); then
|
||||||
|
high=$point
|
||||||
|
break
|
||||||
|
else
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -n "$high" ]]; then
|
||||||
|
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||||
|
mid=$(( (low + high) / 2 ))
|
||||||
|
probes=$(( probes + 1 ))
|
||||||
|
run_single_serving_probe \
|
||||||
|
"$test_name" "$qps" "$mid" "$tp" \
|
||||||
|
"$compilation_config_mode" "$optimization_level" \
|
||||||
|
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||||
|
rc=$?
|
||||||
|
if (( rc == 0 )); then
|
||||||
|
low=$mid
|
||||||
|
elif (( rc == 1 )); then
|
||||||
|
high=$mid
|
||||||
|
else
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
|
||||||
|
}
|
||||||
|
|
||||||
run_benchmark_tests() {
|
run_benchmark_tests() {
|
||||||
# run benchmark tests using `vllm bench <test_type>` command
|
# run benchmark tests using `vllm bench <test_type>` command
|
||||||
# $1: test type (latency or throughput)
|
# $1: test type (latency or throughput)
|
||||||
@@ -347,10 +652,48 @@ run_serving_tests() {
|
|||||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||||
|
|
||||||
server_args=$(json2args "$server_params")
|
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
|
||||||
|
server_model=$(echo "$server_params" | jq -r '.model // empty')
|
||||||
|
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
|
||||||
|
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
|
||||||
|
server_args=$(json2args "$server_params_no_model")
|
||||||
|
|
||||||
server_envs=$(json2envs "$server_envs")
|
server_envs=$(json2envs "$server_envs")
|
||||||
client_args=$(json2args "$client_params")
|
client_args=$(json2args "$client_params")
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Option 1: Dynamic num-prompts scaling based on max_concurrency
|
||||||
|
#
|
||||||
|
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
|
||||||
|
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
|
||||||
|
#
|
||||||
|
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
|
||||||
|
# unchanged (i.e., whatever is in serving-tests-*.json).
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
|
||||||
|
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
|
||||||
|
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
|
||||||
|
|
||||||
|
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||||
|
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||||
|
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||||
|
# Handles: --num-prompts 123 and --num-prompts=123
|
||||||
|
client_args_no_np="$(
|
||||||
|
printf ' %s ' "$client_args" \
|
||||||
|
| sed -E \
|
||||||
|
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
|
||||||
|
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
|
||||||
|
)"
|
||||||
|
# normalize whitespace
|
||||||
|
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
|
||||||
|
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
|
||||||
|
client_args_effective="$client_args_no_np"
|
||||||
|
else
|
||||||
|
client_args_effective="$client_args"
|
||||||
|
fi
|
||||||
# qps_list
|
# qps_list
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
@@ -382,14 +725,13 @@ run_serving_tests() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# check if server model and client model is aligned
|
# check if server model and client model is aligned
|
||||||
server_model=$(echo "$server_params" | jq -r '.model')
|
|
||||||
client_model=$(echo "$client_params" | jq -r '.model')
|
client_model=$(echo "$client_params" | jq -r '.model')
|
||||||
if [[ $server_model != "$client_model" ]]; then
|
if [[ $server_model != "$client_model" ]]; then
|
||||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
server_command="$server_envs vllm serve \
|
server_command="$server_envs vllm serve $server_model \
|
||||||
$server_args"
|
$server_args"
|
||||||
|
|
||||||
# run the server
|
# run the server
|
||||||
@@ -436,6 +778,14 @@ run_serving_tests() {
|
|||||||
for max_concurrency in $max_concurrency_list; do
|
for max_concurrency in $max_concurrency_list; do
|
||||||
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||||
echo " new test name $new_test_name"
|
echo " new test name $new_test_name"
|
||||||
|
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
|
||||||
|
num_prompts_arg=""
|
||||||
|
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||||
|
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||||
|
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||||
|
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||||
|
num_prompts_arg="--num-prompts $num_prompts"
|
||||||
|
fi
|
||||||
# pass the tensor parallel size, the compilation mode, and the optimization
|
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||||
# level to the client so that they can be used on the benchmark dashboard
|
# level to the client so that they can be used on the benchmark dashboard
|
||||||
client_command="vllm bench serve \
|
client_command="vllm bench serve \
|
||||||
@@ -444,8 +794,9 @@ run_serving_tests() {
|
|||||||
--result-filename ${new_test_name}.json \
|
--result-filename ${new_test_name}.json \
|
||||||
--request-rate $qps \
|
--request-rate $qps \
|
||||||
--max-concurrency $max_concurrency \
|
--max-concurrency $max_concurrency \
|
||||||
|
$num_prompts_arg \
|
||||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||||
$client_args $client_remote_args "
|
$client_args_effective $client_remote_args "
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
@@ -467,6 +818,11 @@ run_serving_tests() {
|
|||||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
adaptive_refine_from_static_results \
|
||||||
|
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
|
||||||
|
"$compilation_config_mode" "$optimization_level" \
|
||||||
|
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||||
done
|
done
|
||||||
|
|
||||||
# clean up
|
# clean up
|
||||||
@@ -532,6 +888,7 @@ main() {
|
|||||||
# postprocess benchmarking results
|
# postprocess benchmarking results
|
||||||
pip install tabulate pandas
|
pip install tabulate pandas
|
||||||
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
|
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
|
||||||
|
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
|
||||||
|
|
||||||
upload_to_buildkite
|
upload_to_buildkite
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,37 @@
|
|||||||
|
{
|
||||||
|
"defaults": {
|
||||||
|
"qps_list": [
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"model": "openai/whisper-large-v3-turbo"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "openai/whisper-large-v3-turbo",
|
||||||
|
"backend": "openai-audio",
|
||||||
|
"endpoint": "/v1/audio/transcriptions",
|
||||||
|
"dataset_name": "hf",
|
||||||
|
"dataset_path": "openslr/librispeech_asr",
|
||||||
|
"hf_subset": "clean",
|
||||||
|
"hf_split": "test",
|
||||||
|
"no_stream": "",
|
||||||
|
"no_oversample": "",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tests": [
|
||||||
|
{
|
||||||
|
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -149,6 +149,39 @@
|
|||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_2048_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@@ -188,6 +221,45 @@
|
|||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 1
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 4
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
|
|||||||
@@ -72,17 +72,6 @@
|
|||||||
"random-output-len": 128
|
"random-output-len": 128
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@@ -105,17 +94,6 @@
|
|||||||
"random-output-len": 2048
|
"random-output-len": 2048
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
|
||||||
"server_parameters": {
|
|
||||||
"tensor_parallel_size": 4
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@@ -139,14 +117,25 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"tensor_parallel_size": 4
|
"tensor_parallel_size": 1
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 2048,
|
"random-input-len": 2048,
|
||||||
"random-output-len": 128
|
"random-output-len": 2048
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||||
|
"server_parameters": {
|
||||||
|
"tensor_parallel_size": 2
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 2048,
|
||||||
|
"random-output-len": 2048
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -10,7 +10,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -37,7 +36,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -64,7 +62,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -91,7 +88,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "deepseek-ai/DeepSeek-R1",
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
"tensor_parallel_size": 8,
|
"tensor_parallel_size": 8,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -23,7 +22,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -41,7 +39,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -59,7 +56,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"speculative_config": {
|
"speculative_config": {
|
||||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||||
"num_speculative_tokens": 4,
|
"num_speculative_tokens": 4,
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ steps:
|
|||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||||
@@ -152,7 +152,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -166,12 +166,19 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
|
|||||||
EOF
|
EOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Notify Slack if webhook is configured.
|
# Notify Slack if webhook is configured and PR/branch are valid.
|
||||||
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||||
echo ">>> Sending Slack notification"
|
PR="${BUILDKITE_PULL_REQUEST:-}"
|
||||||
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
BRANCH="${BUILDKITE_BRANCH:-}"
|
||||||
# shellcheck disable=SC2016
|
|
||||||
PAYLOAD=$(python3 -c '
|
# Skip notification if PR is invalid or branch is empty
|
||||||
|
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
|
||||||
|
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
|
||||||
|
else
|
||||||
|
echo ">>> Sending Slack notification"
|
||||||
|
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||||
|
# shellcheck disable=SC2016
|
||||||
|
PAYLOAD=$(python3 -c '
|
||||||
import json, os, sys
|
import json, os, sys
|
||||||
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
||||||
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
||||||
@@ -194,10 +201,11 @@ data = {
|
|||||||
print(json.dumps(data))
|
print(json.dumps(data))
|
||||||
')
|
')
|
||||||
|
|
||||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
||||||
-H 'Content-type: application/json' \
|
-H 'Content-type: application/json' \
|
||||||
-d "$PAYLOAD")
|
-d "$PAYLOAD")
|
||||||
echo " Slack webhook response: $HTTP_CODE"
|
echo " Slack webhook response: $HTTP_CODE"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -205,6 +205,13 @@ re_quote_pytest_markers() {
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
if $is_boundary; then
|
if $is_boundary; then
|
||||||
|
# Strip surrounding double quotes if present (from upstream
|
||||||
|
# single-to-double conversion); without this, wrapping below
|
||||||
|
# would produce '"expr"' with literal double-quote characters.
|
||||||
|
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||||
|
marker_buf="${marker_buf#\"}"
|
||||||
|
marker_buf="${marker_buf%\"}"
|
||||||
|
fi
|
||||||
# Flush the collected marker expression
|
# Flush the collected marker expression
|
||||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
output+="'${marker_buf}' "
|
output+="'${marker_buf}' "
|
||||||
@@ -242,6 +249,11 @@ re_quote_pytest_markers() {
|
|||||||
|
|
||||||
# Flush any trailing marker expression (marker at end of command)
|
# Flush any trailing marker expression (marker at end of command)
|
||||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||||
|
# Strip surrounding double quotes (see mid-stream flush comment)
|
||||||
|
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||||
|
marker_buf="${marker_buf#\"}"
|
||||||
|
marker_buf="${marker_buf%\"}"
|
||||||
|
fi
|
||||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||||
output+="'${marker_buf}'"
|
output+="'${marker_buf}'"
|
||||||
else
|
else
|
||||||
@@ -321,15 +333,15 @@ apply_rocm_test_overrides() {
|
|||||||
# --- Entrypoint ignores ---
|
# --- Entrypoint ignores ---
|
||||||
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||||
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||||
--ignore=entrypoints/openai/test_audio.py \
|
--ignore=entrypoints/openai/chat_completion/test_audio.py \
|
||||||
--ignore=entrypoints/openai/test_shutdown.py \
|
--ignore=entrypoints/openai/completion/test_shutdown.py \
|
||||||
--ignore=entrypoints/openai/test_completion.py \
|
--ignore=entrypoints/openai/test_completion.py \
|
||||||
--ignore=entrypoints/openai/test_models.py \
|
--ignore=entrypoints/openai/test_models.py \
|
||||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||||
--ignore=entrypoints/openai/test_root_path.py \
|
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
|
||||||
--ignore=entrypoints/openai/test_tokenization.py \
|
--ignore=entrypoints/openai/test_tokenization.py \
|
||||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||||
@@ -492,6 +504,8 @@ else
|
|||||||
-e HF_TOKEN \
|
-e HF_TOKEN \
|
||||||
-e AWS_ACCESS_KEY_ID \
|
-e AWS_ACCESS_KEY_ID \
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
|
-e BUILDKITE_PARALLEL_JOB \
|
||||||
|
-e BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||||
|
|||||||
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -euox pipefail
|
||||||
|
|
||||||
|
# vLLM CPU settings used for this CI job.
export VLLM_CPU_KVCACHE_SPACE=1 VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1 VLLM_ENABLE_V1_MULTIPROCESSING=0

# Download the SDE archive, verify it against the pinned SHA-256 digest, and
# unpack it into ./sde (dropping the archive's top-level directory).
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"

wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
sha256sum --check <<< "${SDE_CHECKSUM} ${SDE_ARCHIVE}"

mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
|
||||||
|
|
||||||
|
# Wait for a background process and report how it ended.
#
# Arguments:
#   $1 - PID of a background child of this shell.
#   $2 - Path of the log file that process writes to.
# Returns:
#   0 when the process exited with status 0.
#   1 when it exited non-zero (the last 50 lines of the log are printed
#     first) or when either argument is missing/empty.
wait_for_pid_and_check_log() {
    # Default to empty so that a missing argument reaches the usage check
    # below instead of tripping `set -u` ("unbound variable").
    local pid="${1:-}"
    local log_file="${2:-}"
    local exit_status=0

    if [ -z "$pid" ] || [ -z "$log_file" ]; then
        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>" >&2
        return 1
    fi

    echo "Waiting for process $pid to finish..."

    # `wait` returns the exit status of the waited-for process. Capturing it
    # through `||` keeps `set -e` from aborting on a non-zero status.
    wait "$pid" || exit_status=$?

    if [ "$exit_status" -ne 0 ]; then
        echo "Process $pid finished with exit status $exit_status (Failure)."
        echo "Process exited with a non-zero status."
        echo "--- Last few lines of log file: $log_file ---"
        # Best effort: an unreadable log must not hide the real failure.
        tail -n 50 "$log_file" || echo "(unable to read $log_file)"
        echo "---------------------------------------------"
        return 1 # Indicate failure based on exit status
    fi

    echo "Process $pid finished with exit status $exit_status (Success)."
    # Only the exit status is checked; the log content itself is not inspected.
    echo "Process exited successfully."
    return 0
}
|
||||||
|
|
||||||
|
# Run the same offline-inference example through ./sde/sde64 for three CPU
# targets in parallel, each run writing to its own log file.

# Test Skylake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!

# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!

# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!

# Wait for every run before failing. A bare failing call would make `set -e`
# abort at the first failure, leaving the other runs un-waited and their log
# tails unprinted.
failed=0
wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || failed=1
wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || failed=1
wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || failed=1
exit "$failed"
|
||||||
@@ -34,7 +34,7 @@ function cpu_tests() {
|
|||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run model tests
|
# Run model tests
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
|
|||||||
@@ -25,5 +25,5 @@ remove_docker_container
|
|||||||
|
|
||||||
# Run the image and test offline inference
|
# Run the image and test offline inference
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
|||||||
-e PT_HPU_LAZY_MODE=1 \
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c '
|
/bin/bash -c '
|
||||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
'
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
|
|||||||
@@ -34,17 +34,17 @@ docker run \
|
|||||||
set -e
|
set -e
|
||||||
echo $ZE_AFFINITY_MASK
|
echo $ZE_AFFINITY_MASK
|
||||||
pip install tblib==3.1.0
|
pip install tblib==3.1.0
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
|
|||||||
BACKENDS=("allgather_reducescatter")
|
BACKENDS=("allgather_reducescatter")
|
||||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||||
export VLLM_ROCM_MOE_PADDING=0
|
export VLLM_ROCM_MOE_PADDING=0
|
||||||
PLATFORM_ARGS=("--no-async-scheduling")
|
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||||
else
|
else
|
||||||
# Non-ROCm platform (CUDA/other)
|
# Non-ROCm platform (CUDA/other)
|
||||||
|
|||||||
248
.buildkite/scripts/tool_call/run-bfcl-eval.sh
Executable file
248
.buildkite/scripts/tool_call/run-bfcl-eval.sh
Executable file
@@ -0,0 +1,248 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
|
||||||
|
# evaluation against a local vLLM server.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Run with defaults (gpt-oss-20b, multi_turn)
|
||||||
|
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||||
|
#
|
||||||
|
# # Run with gpt-oss-120b and multiple test categories
|
||||||
|
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
|
||||||
|
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
|
||||||
|
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||||
|
#
|
||||||
|
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
|
||||||
|
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
|
||||||
|
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
|
||||||
|
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
|
||||||
|
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||||
|
#
|
||||||
|
# Environment variables (all optional, with defaults):
|
||||||
|
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
|
||||||
|
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
|
||||||
|
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
|
||||||
|
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
|
||||||
|
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
|
||||||
|
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
|
||||||
|
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
|
||||||
|
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
|
||||||
|
# BFCL_PORT - Server port (default: 8000)
|
||||||
|
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
|
||||||
|
# BFCL_EXTRA_ARGS - Additional vLLM server args
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# ---- Configuration ----
|
||||||
|
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
|
||||||
|
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
|
||||||
|
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
|
||||||
|
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
|
||||||
|
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
|
||||||
|
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
|
||||||
|
TP_SIZE="${BFCL_TP_SIZE:-1}"
|
||||||
|
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
|
||||||
|
PORT="${BFCL_PORT:-8000}"
|
||||||
|
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
|
||||||
|
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
|
||||||
|
|
||||||
|
# Set up output directory
|
||||||
|
if [ -n "$OUTPUT_DIR" ]; then
|
||||||
|
mkdir -p "$OUTPUT_DIR"
|
||||||
|
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "============================================"
|
||||||
|
echo "BFCL Tool Call Correctness Evaluation"
|
||||||
|
echo "============================================"
|
||||||
|
echo "Model: $MODEL"
|
||||||
|
echo "Tool parser: $TOOL_CALL_PARSER"
|
||||||
|
echo "API type: $API_TYPE"
|
||||||
|
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
|
||||||
|
echo "Test category: $TEST_CATEGORY"
|
||||||
|
echo "TP size: $TP_SIZE"
|
||||||
|
echo "Max model len: $MAX_MODEL_LEN"
|
||||||
|
echo "Port: $PORT"
|
||||||
|
echo "Num threads: $NUM_THREADS"
|
||||||
|
echo "============================================"
|
||||||
|
|
||||||
|
# ---- Install bfcl-eval if missing ----
|
||||||
|
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
|
||||||
|
echo "Installing bfcl-eval..."
|
||||||
|
pip install "bfcl-eval>=2025.10.20.1,<2026"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---- Cleanup handler ----
|
||||||
|
SERVER_PID=""
|
||||||
|
# EXIT handler: stop the vLLM server if one was started, then delete the
# BFCL lock directories from the current directory and, when set, OUTPUT_DIR.
cleanup() {
    # BFCL lock files (created by filelock for thread-safe writes)
    local lock_dirs=(.file_locks/)
    if [ -n "${OUTPUT_DIR:-}" ]; then
        lock_dirs+=("$OUTPUT_DIR/.file_locks/")
    fi

    if [ -n "$SERVER_PID" ]; then
        echo "Stopping vLLM server (pid=$SERVER_PID)..."
        # Both steps are best effort: the server may already be gone.
        kill "$SERVER_PID" 2>/dev/null || true
        wait "$SERVER_PID" 2>/dev/null || true
    fi

    rm -rf "${lock_dirs[@]}"
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
# ---- Start vLLM server ----
|
||||||
|
echo "Starting vLLM server..."
|
||||||
|
|
||||||
|
SERVE_ARGS=(
|
||||||
|
"$MODEL"
|
||||||
|
--port "$PORT"
|
||||||
|
--enable-auto-tool-choice
|
||||||
|
--tool-call-parser "$TOOL_CALL_PARSER"
|
||||||
|
--tensor-parallel-size "$TP_SIZE"
|
||||||
|
--max-model-len "$MAX_MODEL_LEN"
|
||||||
|
--enforce-eager
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
)
|
||||||
|
|
||||||
|
# Append reasoning parser if specified
|
||||||
|
if [ -n "$REASONING_PARSER" ]; then
|
||||||
|
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Append any extra args
|
||||||
|
if [ -n "$EXTRA_ARGS" ]; then
|
||||||
|
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
|
||||||
|
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Command: vllm serve ${SERVE_ARGS[*]}"
|
||||||
|
vllm serve "${SERVE_ARGS[@]}" &
|
||||||
|
SERVER_PID=$!
|
||||||
|
|
||||||
|
# ---- Wait for server to be ready ----
# Poll the /health endpoint every 2s until it answers, giving up after 600s
# or as soon as the server process itself has exited.
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
    # Fail fast: if the server process is gone there is no point polling
    # until the timeout expires.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo ""
        echo "ERROR: vLLM server (pid=$SERVER_PID) exited before becoming ready"
        exit 1
    fi
    if [ "$SECONDS_WAITED" -ge 600 ]; then
        echo ""
        echo "ERROR: vLLM server failed to start within 600s"
        exit 1
    fi
    # Progress note every 30s so the CI log shows the job is still alive.
    if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
        echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
    fi
    sleep 2
    SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
|
||||||
|
|
||||||
|
# ---- Run BFCL evaluation ----
|
||||||
|
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
|
||||||
|
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
|
||||||
|
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
|
||||||
|
# against our local vLLM server.
|
||||||
|
bfcl_exit_code=0
|
||||||
|
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Positional arguments supplied by the wrapping shell script, in order:
#   model, test_category, num_threads, port, api_type, output_dir
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
# A missing or empty sixth argument falls back to the current directory.
output_dir = (sys.argv[6] if len(sys.argv) > 6 else "") or os.getcwd()

# These variables are set before any bfcl_eval module is imported below.
os.environ.update(
    {
        "OPENAI_BASE_URL": f"http://localhost:{port}/v1",
        "OPENAI_API_KEY": "dummy",
        "BFCL_PROJECT_ROOT": output_dir,
    }
)
|
||||||
|
|
||||||
|
import bfcl_eval.constants.model_config as bfcl_model_config
|
||||||
|
from bfcl_eval.constants.model_config import ModelConfig
|
||||||
|
from bfcl_eval.model_handler.api_inference.openai_completion import (
|
||||||
|
OpenAICompletionsHandler,
|
||||||
|
)
|
||||||
|
from bfcl_eval.model_handler.api_inference.openai_response import (
|
||||||
|
OpenAIResponsesHandler,
|
||||||
|
)
|
||||||
|
|
||||||
|
# "responses" selects the Responses API handler; every other value uses the
# completions handler.
handler = (
    OpenAIResponsesHandler if api_type == "responses" else OpenAICompletionsHandler
)

# Register the served model in BFCL's in-process model table so that
# generate()/evaluate() can resolve it by name.
_model_config_fields = dict(
    model_name=model,
    display_name=f"{model} (FC) (vLLM)",
    url=f"https://huggingface.co/{model}",
    org="",
    license="apache-2.0",
    model_handler=handler,
    input_price=None,
    output_price=None,
    is_fc_model=True,
    underscore_to_dot=True,
)
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(**_model_config_fields)
|
||||||
|
|
||||||
|
from bfcl_eval.__main__ import evaluate, generate
|
||||||
|
import inspect
|
||||||
|
import typer
|
||||||
|
|
||||||
|
|
||||||
|
def _get_default_kwargs(function):
    """Collect the default value of every defaulted parameter of ``function``.

    Parameters that have no default are left out. When a default is a
    ``typer.models.OptionInfo`` instance, the value stored in its ``default``
    attribute is used instead of the ``OptionInfo`` object itself.

    Returns:
        dict mapping parameter name to its (unwrapped) default value.
    """
    defaults = {}
    for name, param in inspect.signature(function).parameters.items():
        value = param.default
        if value is inspect.Parameter.empty:
            continue
        if isinstance(value, typer.models.OptionInfo):
            value = value.default
        defaults[name] = value
    return defaults
|
||||||
|
|
||||||
|
|
||||||
|
# The comma-separated category string is split and stripped once; each phase
# below receives its own copy of the resulting list.
categories = [name.strip() for name in test_category.split(",")]

# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs.update(
    model=[model],
    test_category=list(categories),
    skip_server_setup=True,
    num_threads=num_threads,
)
generate(**gen_kwargs)

# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs.update(model=[model], test_category=list(categories))
evaluate(**eval_kwargs)

print("=== BFCL evaluation completed successfully ===")
|
||||||
|
PYEOF
|
||||||
|
|
||||||
|
# ---- Upload results to buildkite ----
# Runs only when the buildkite-agent CLI is available on PATH. The script's
# final exit status is always the status recorded in bfcl_exit_code.
if command -v buildkite-agent &>/dev/null; then
    # Annotation colour and headline follow the evaluation outcome.
    case "$bfcl_exit_code" in
        0)
            STYLE="success"
            STATUS="PASSED"
            ;;
        *)
            STYLE="error"
            STATUS="FAILED"
            ;;
    esac

    buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF

    # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
    # $BFCL_PROJECT_ROOT/score/
    RESULTS_ROOT="${OUTPUT_DIR:-.}"
    for artifact_dir in result score; do
        # Upload each directory only if it exists.
        if [[ -d "$RESULTS_ROOT/$artifact_dir" ]]; then
            buildkite-agent artifact upload "$RESULTS_ROOT/$artifact_dir/**/*"
        fi
    done
fi

exit "$bfcl_exit_code"
|
||||||
@@ -72,7 +72,7 @@ obj_json="objects.json"
|
|||||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
|
|
||||||
# call script to generate indicies for all existing wheels
|
# call script to generate indices for all existing wheels
|
||||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||||
|
|||||||
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
|
|||||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source distribution using setup.py
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
python setup.py sdist --dist-dir=$DIST_DIR
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||||
|
echo "Found sdist: $SDIST_FILE"
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
echo "Wheels uploaded to PyPI"
|
echo "Wheels and source distribution uploaded to PyPI"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -14,8 +14,3 @@ steps:
|
|||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
mirror:
|
|
||||||
amd:
|
|
||||||
device: mi325_1
|
|
||||||
depends_on:
|
|
||||||
- image-build-amd
|
|
||||||
|
|||||||
@@ -36,6 +36,16 @@ steps:
|
|||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
|
- label: AsyncTP Correctness Tests (B200)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
- label: Distributed Compile Unit Tests (2xH100)
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
@@ -91,8 +101,8 @@ steps:
|
|||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
|
||||||
|
|
||||||
- label: Fusion E2E Config Sweep (H100)
|
- label: Fusion E2E Config Sweep (H100)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -122,9 +132,9 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
|
||||||
|
|
||||||
- label: Fusion E2E TP2 Quick (H100)
|
- label: Fusion E2E TP2 Quick (H100)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@@ -140,8 +150,8 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||||
|
|
||||||
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -195,7 +205,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||||
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||||
# for ar-rms-quant-fp4, also sweep llama3
|
# for ar-rms-quant-fp4, also sweep llama3
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
|
||||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
|
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||||
|
|||||||
@@ -50,23 +50,18 @@ steps:
|
|||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs)
|
- label: Distributed Torchrun + Examples (4 GPUs)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_devices: 4
|
num_devices: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_torchrun_example.py
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_torchrun_example_moe.py
|
||||||
- tests/distributed/test_events
|
|
||||||
- tests/compile/fullgraph/test_basic_correctness.py
|
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- examples/offline_inference/new_weight_syncing/
|
- examples/offline_inference/new_weight_syncing/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/distributed
|
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
@@ -84,19 +79,6 @@ steps:
|
|||||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
|
||||||
- pytest -v -s distributed/test_utils.py
|
|
||||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
|
||||||
- pytest -v -s distributed/test_events.py
|
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
|
||||||
# when we have multiple distributed example tests
|
|
||||||
# OLD rlhf examples
|
# OLD rlhf examples
|
||||||
- cd ../examples/offline_inference
|
- cd ../examples/offline_inference
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||||
@@ -106,6 +88,47 @@ steps:
|
|||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||||
|
|
||||||
|
- label: Distributed DP Tests (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/
|
||||||
|
- tests/v1/distributed
|
||||||
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
|
- tests/distributed/test_utils
|
||||||
|
commands:
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||||
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||||
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||||
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
|
- pytest -v -s distributed/test_utils.py
|
||||||
|
|
||||||
|
- label: Distributed Compile + Comm (4 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/
|
||||||
|
- tests/distributed/test_pynccl
|
||||||
|
- tests/distributed/test_events
|
||||||
|
- tests/compile/fullgraph/test_basic_correctness.py
|
||||||
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
|
- tests/distributed/test_multiproc_executor.py
|
||||||
|
commands:
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
|
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||||
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
|
- pytest -v -s distributed/test_events.py
|
||||||
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
|
# test multi-node TP with multiproc executor (simulated on single node)
|
||||||
|
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||||
|
|
||||||
- label: Distributed Tests (8 GPUs)(H100)
|
- label: Distributed Tests (8 GPUs)(H100)
|
||||||
timeout_in_minutes: 10
|
timeout_in_minutes: 10
|
||||||
device: h100
|
device: h100
|
||||||
@@ -146,7 +169,7 @@ steps:
|
|||||||
num_devices: 2
|
num_devices: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
|
||||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
@@ -210,6 +233,19 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
device: a100
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- vllm/v1/worker/kv_connector_model_runner_mixin.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
group: Engine
|
group: Engine
|
||||||
depends_on:
|
depends_on:
|
||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: Engine
|
- label: Engine
|
||||||
@@ -14,28 +14,30 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||||
|
|
||||||
- label: V1 e2e + engine (1 GPU)
|
- label: Engine (1 GPU)
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 30
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/v1/engine/
|
||||||
- tests/v1
|
- tests/v1/engine/
|
||||||
commands:
|
commands:
|
||||||
# TODO: accuracy does not match, whether setting
|
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
|
||||||
- pytest -v -s v1/e2e
|
|
||||||
# Run this test standalone for now;
|
|
||||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
|
||||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||||
# Run the rest of v1/engine tests
|
|
||||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||||
mirror:
|
|
||||||
amd:
|
- label: e2e Scheduling (1 GPU)
|
||||||
device: mi325_1
|
timeout_in_minutes: 30
|
||||||
depends_on:
|
source_file_dependencies:
|
||||||
- image-build-amd
|
- vllm/v1/
|
||||||
commands:
|
- tests/v1/e2e/general/
|
||||||
- pytest -v -s v1/e2e
|
commands:
|
||||||
- pytest -v -s v1/engine
|
- pytest -v -s v1/e2e/general/test_async_scheduling.py
|
||||||
|
|
||||||
|
- label: e2e Core (1 GPU)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/
|
||||||
|
- tests/v1/e2e/general/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
|
||||||
|
|
||||||
- label: V1 e2e (2 GPUs)
|
- label: V1 e2e (2 GPUs)
|
||||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||||
@@ -46,7 +48,7 @@ steps:
|
|||||||
- tests/v1/e2e
|
- tests/v1/e2e
|
||||||
commands:
|
commands:
|
||||||
# Only run tests that need exactly 2 GPUs
|
# Only run tests that need exactly 2 GPUs
|
||||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||||
mirror:
|
mirror:
|
||||||
amd:
|
amd:
|
||||||
device: mi325_2
|
device: mi325_2
|
||||||
@@ -62,7 +64,7 @@ steps:
|
|||||||
- tests/v1/e2e
|
- tests/v1/e2e
|
||||||
commands:
|
commands:
|
||||||
# Only run tests that need 4 GPUs
|
# Only run tests that need 4 GPUs
|
||||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||||
mirror:
|
mirror:
|
||||||
amd:
|
amd:
|
||||||
device: mi325_4
|
device: mi325_4
|
||||||
|
|||||||
@@ -24,11 +24,6 @@ steps:
|
|||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
mirror:
|
|
||||||
amd:
|
|
||||||
device: mi325_1
|
|
||||||
depends_on:
|
|
||||||
- image-build-amd
|
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 1)
|
- label: Entrypoints Integration (API Server 1)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -39,8 +34,13 @@ steps:
|
|||||||
- tests/entrypoints/test_chat_utils
|
- tests/entrypoints/test_chat_utils
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 2)
|
- label: Entrypoints Integration (API Server 2)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -65,11 +65,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/pooling
|
- pytest -v -s entrypoints/pooling
|
||||||
mirror:
|
|
||||||
amd:
|
|
||||||
device: mi325_1
|
|
||||||
depends_on:
|
|
||||||
- image-build-amd
|
|
||||||
|
|
||||||
- label: Entrypoints Integration (Responses API)
|
- label: Entrypoints Integration (Responses API)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
@@ -87,6 +82,11 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: OpenAI API Correctness
|
- label: OpenAI API Correctness
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
- tests/kernels/test_top_k_per_row.py
|
- tests/kernels/test_top_k_per_row.py
|
||||||
|
- tests/kernels/test_concat_mla_q.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
- label: Kernels Attention Test %N
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
@@ -96,7 +97,7 @@ steps:
|
|||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
|
|||||||
@@ -67,12 +67,13 @@ steps:
|
|||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/chat.py # for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/embed.py
|
||||||
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ steps:
|
|||||||
- vllm/config/model.py
|
- vllm/config/model.py
|
||||||
- vllm/model_executor
|
- vllm/model_executor
|
||||||
- tests/model_executor
|
- tests/model_executor
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s model_executor
|
- pytest -v -s model_executor
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||||
|
|||||||
110
.buildkite/test_areas/model_runner_v2.yaml
Normal file
110
.buildkite/test_areas/model_runner_v2.yaml
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
group: Model Runner V2
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: Model Runner V2 Core Tests
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- vllm/v1/core/sched/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
- tests/v1/engine/test_llm_engine.py
|
||||||
|
- tests/v1/e2e/
|
||||||
|
- tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
|
||||||
|
# This requires eager until we sort out CG correctness issues.
|
||||||
|
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
|
||||||
|
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
|
||||||
|
- pytest -v -s v1/e2e/general/test_context_length.py
|
||||||
|
- pytest -v -s v1/e2e/general/test_min_tokens.py
|
||||||
|
# Temporary hack filter to exclude ngram spec decoding based tests.
|
||||||
|
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
|
||||||
|
|
||||||
|
- label: Model Runner V2 Examples
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
working_dir: "/vllm-workspace/examples"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/core/sched/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- examples/offline_inference/
|
||||||
|
- examples/basic/offline_inference/
|
||||||
|
- examples/pooling/embed/vision_embedding_offline.py
|
||||||
|
- examples/others/tensorize_vllm_model.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
- pip install tensorizer # for tensorizer test
|
||||||
|
- python3 basic/offline_inference/chat.py # for basic
|
||||||
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
|
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
|
||||||
|
#- python3 basic/offline_inference/embed.py # TODO
|
||||||
|
# for multi-modal models
|
||||||
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
|
# for pooling models
|
||||||
|
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||||
|
# for features demo
|
||||||
|
- python3 offline_inference/prefix_caching.py
|
||||||
|
- python3 offline_inference/llm_engine_example.py
|
||||||
|
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
|
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||||
|
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||||
|
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||||
|
|
||||||
|
- label: Model Runner V2 Distributed (2 GPUs)
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- tests/basic_correctness/test_basic_correctness.py
|
||||||
|
- tests/v1/distributed/test_async_llm_dp.py
|
||||||
|
- tests/v1/distributed/test_eagle_dp.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
|
||||||
|
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
|
||||||
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||||
|
|
||||||
|
# These require fix https://github.com/vllm-project/vllm/pull/36280
|
||||||
|
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- tests/distributed/test_pipeline_parallel.py
|
||||||
|
#- tests/distributed/test_pp_cudagraph.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
|
||||||
|
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
|
||||||
|
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
|
||||||
|
|
||||||
|
- label: Model Runner V2 Spec Decode
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- tests/v1/spec_decode/test_max_len.py
|
||||||
|
- tests/v1/e2e/spec_decode/test_spec_decode.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
|
||||||
|
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
|
||||||
@@ -65,7 +65,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|||||||
@@ -2,16 +2,65 @@ group: Models - Multimodal
|
|||||||
depends_on:
|
depends_on:
|
||||||
- image-build
|
- image-build
|
||||||
steps:
|
steps:
|
||||||
- label: Multi-Modal Models (Standard) # 60min
|
- label: "Multi-Modal Models (Standard) 1: qwen2"
|
||||||
timeout_in_minutes: 80
|
timeout_in_minutes: 45
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/multimodal
|
- tests/models/multimodal
|
||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pip freeze | grep -E 'torch'
|
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
|
||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
|
||||||
|
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
|
||||||
|
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
|
- label: "Multi-Modal Models (Standard) 4: other + whisper"
|
||||||
|
timeout_in_minutes: 45
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/models/multimodal
|
||||||
|
commands:
|
||||||
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test (CPU)
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -54,6 +103,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Models (Extended) 2
|
- label: Multi-Modal Models (Extended) 2
|
||||||
optional: true
|
optional: true
|
||||||
|
|||||||
@@ -15,9 +15,12 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||||
- pip uninstall vllm_add_dummy_platform -y
|
- pip uninstall vllm_add_dummy_platform -y
|
||||||
# end platform plugin tests
|
# end platform plugin tests
|
||||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
# begin io_processor plugins test
|
||||||
|
# test generic io_processor plugins functions
|
||||||
|
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
|
||||||
|
# test Terratorch io_processor plugins
|
||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# test bge_m3_sparse io_processor plugin
|
# test bge_m3_sparse io_processor plugin
|
||||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||||
@@ -33,6 +36,6 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
- pytest -v -s distributed/test_distributed_oot.py
|
- pytest -v -s distributed/test_distributed_oot.py
|
||||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||||
|
|||||||
40
.buildkite/test_areas/spec_decode.yaml
Normal file
40
.buildkite/test_areas/spec_decode.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
group: Spec Decode
|
||||||
|
depends_on:
|
||||||
|
- image-build
|
||||||
|
steps:
|
||||||
|
- label: Spec Decode Eagle
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/v1/worker/gpu/spec_decode/
|
||||||
|
- tests/v1/e2e/spec_decode/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
|
||||||
|
|
||||||
|
- label: Spec Decode Speculators + MTP
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/v1/worker/gpu/spec_decode/
|
||||||
|
- vllm/transformers_utils/configs/speculators/
|
||||||
|
- tests/v1/e2e/spec_decode/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
|
||||||
|
|
||||||
|
- label: Spec Decode Ngram + Suffix
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/v1/worker/gpu/spec_decode/
|
||||||
|
- tests/v1/e2e/spec_decode/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
|
||||||
|
|
||||||
|
- label: Spec Decode Draft Model
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/spec_decode/
|
||||||
|
- vllm/v1/worker/gpu/spec_decode/
|
||||||
|
- tests/v1/e2e/spec_decode/
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
|
||||||
13
.github/mergify.yml
vendored
13
.github/mergify.yml
vendored
@@ -3,6 +3,7 @@ pull_request_rules:
|
|||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- label != stale
|
||||||
|
- -closed
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@@ -26,7 +27,7 @@ pull_request_rules:
|
|||||||
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv pip install pre-commit
|
uv pip install "pre-commit>=4.5.1"
|
||||||
pre-commit install
|
pre-commit install
|
||||||
pre-commit run --all-files
|
pre-commit run --all-files
|
||||||
```
|
```
|
||||||
@@ -37,15 +38,13 @@ pull_request_rules:
|
|||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> <details>
|
> <details>
|
||||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
> <summary>Is <code>mypy</code> failing?</summary>
|
||||||
> <br/>
|
> <br/>
|
||||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
|
||||||
>
|
>
|
||||||
> ```bash
|
> ```bash
|
||||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||||
> pre-commit run --hook-stage manual mypy-3.10
|
> pre-commit run --hook-stage manual mypy-3.10
|
||||||
> # For markdownlint
|
|
||||||
> pre-commit run --hook-stage manual markdownlint
|
|
||||||
> ```
|
> ```
|
||||||
> </details>
|
> </details>
|
||||||
|
|
||||||
@@ -335,7 +334,7 @@ pull_request_rules:
|
|||||||
- or:
|
- or:
|
||||||
- files~=^tests/tool_use/
|
- files~=^tests/tool_use/
|
||||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
|
||||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||||
- files=docs/features/tool_calling.md
|
- files=docs/features/tool_calling.md
|
||||||
- files~=^examples/tool_chat_*
|
- files~=^examples/tool_chat_*
|
||||||
@@ -382,7 +381,7 @@ pull_request_rules:
|
|||||||
- or:
|
- or:
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
||||||
actions:
|
actions:
|
||||||
assign:
|
assign:
|
||||||
|
|||||||
3
.github/workflows/macos-smoke-test.yml
vendored
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -6,6 +6,9 @@ on:
|
|||||||
- main
|
- main
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
macos-m1-smoke-test:
|
macos-m1-smoke-test:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -189,11 +189,9 @@ cython_debug/
|
|||||||
.vscode/
|
.vscode/
|
||||||
|
|
||||||
# Claude
|
# Claude
|
||||||
CLAUDE.md
|
|
||||||
.claude/
|
.claude/
|
||||||
|
|
||||||
# Codex
|
# Codex
|
||||||
AGENTS.md
|
|
||||||
.codex/
|
.codex/
|
||||||
|
|
||||||
# Cursor
|
# Cursor
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ repos:
|
|||||||
args: [--output-format, github, --fix]
|
args: [--output-format, github, --fix]
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
- repo: https://github.com/crate-ci/typos
|
- repo: https://github.com/crate-ci/typos
|
||||||
rev: v1.38.1
|
rev: v1.43.5
|
||||||
hooks:
|
hooks:
|
||||||
- id: typos
|
- id: typos
|
||||||
args: [--force-exclude]
|
args: [--force-exclude]
|
||||||
@@ -24,12 +24,13 @@ repos:
|
|||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||||
types_or: [c++, cuda]
|
types_or: [c++, cuda]
|
||||||
args: [--style=file, --verbose]
|
args: [--style=file, --verbose]
|
||||||
- repo: https://github.com/igorshubovych/markdownlint-cli
|
- repo: https://github.com/DavidAnson/markdownlint-cli2
|
||||||
rev: v0.45.0
|
rev: v0.21.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: markdownlint
|
- id: markdownlint-cli2
|
||||||
exclude: '.*\.inc\.md'
|
language_version: lts
|
||||||
stages: [manual] # Only run in CI
|
args: [--fix]
|
||||||
|
exclude: ^CLAUDE\.md$
|
||||||
- repo: https://github.com/rhysd/actionlint
|
- repo: https://github.com/rhysd/actionlint
|
||||||
rev: v1.7.7
|
rev: v1.7.7
|
||||||
hooks:
|
hooks:
|
||||||
@@ -55,7 +56,7 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types_or: [python, pyi]
|
types_or: [python, pyi]
|
||||||
require_serial: true
|
require_serial: true
|
||||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.10
|
name: Run mypy for Python 3.10
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
||||||
@@ -127,6 +128,13 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
|
# prevent use of torch.cuda APIs
|
||||||
|
- id: check-torch-cuda-call
|
||||||
|
name: "Prevent new 'torch.cuda' APIs call"
|
||||||
|
entry: python tools/pre_commit/check_torch_cuda.py
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: [regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/pre_commit/validate_config.py
|
entry: python tools/pre_commit/validate_config.py
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ build:
|
|||||||
python: "3.12"
|
python: "3.12"
|
||||||
jobs:
|
jobs:
|
||||||
post_checkout:
|
post_checkout:
|
||||||
|
# - bash docs/maybe_skip_pr_build.sh
|
||||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||||
pre_create_environment:
|
pre_create_environment:
|
||||||
- pip install uv
|
- pip install uv
|
||||||
|
|||||||
113
AGENTS.md
Normal file
113
AGENTS.md
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# Agent Instructions for vLLM
|
||||||
|
|
||||||
|
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
|
||||||
|
> Breaching these guidelines can result in automatic banning.
|
||||||
|
|
||||||
|
## 1. Contribution Policy (Mandatory)
|
||||||
|
|
||||||
|
### Duplicate-work checks
|
||||||
|
|
||||||
|
Before proposing a PR, run these checks:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
gh issue view <issue_number> --repo vllm-project/vllm --comments
|
||||||
|
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
|
||||||
|
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
|
||||||
|
```
|
||||||
|
|
||||||
|
- If an open PR already addresses the same fix, do not open another.
|
||||||
|
- If your approach is materially different, explain the difference in the issue.
|
||||||
|
|
||||||
|
### No low-value busywork PRs
|
||||||
|
|
||||||
|
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
|
||||||
|
|
||||||
|
### Accountability
|
||||||
|
|
||||||
|
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
|
||||||
|
- The submitting human must review every changed line and run relevant tests.
|
||||||
|
- PR descriptions for AI-assisted work **must** include:
|
||||||
|
- Why this is not duplicating an existing PR.
|
||||||
|
- Test commands run and results.
|
||||||
|
- Clear statement that AI assistance was used.
|
||||||
|
|
||||||
|
### Fail-closed behavior
|
||||||
|
|
||||||
|
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Development Workflow
|
||||||
|
|
||||||
|
### Environment setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install `uv` if you don't have it already:
|
||||||
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
|
||||||
|
# Always use `uv` for Python environment management:
|
||||||
|
uv venv --python 3.12
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Always make sure `pre-commit` and its hooks are installed:
|
||||||
|
uv pip install -r requirements/lint.txt
|
||||||
|
pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Installing dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# If you are only making Python changes:
|
||||||
|
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||||
|
|
||||||
|
# If you are also making C/C++ changes:
|
||||||
|
uv pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running tests
|
||||||
|
|
||||||
|
Tests require extra dependencies.
|
||||||
|
All versions for test dependencies should be read from `requirements/test.txt`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install bare minimum test dependencies:
|
||||||
|
uv pip install pytest pytest-asyncio tblib
|
||||||
|
|
||||||
|
# Install additional test dependencies as needed, or install them all as follows:
|
||||||
|
uv pip install -r requirements/test.txt
|
||||||
|
|
||||||
|
# Run specific test from specific test file
|
||||||
|
pytest tests/path/to/test.py -v -s -k test_name
|
||||||
|
|
||||||
|
# Run all tests in directory
|
||||||
|
pytest tests/path/to/dir -v -s
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running linters
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run all pre-commit hooks on staged files:
|
||||||
|
pre-commit run
|
||||||
|
|
||||||
|
# Run on all files:
|
||||||
|
pre-commit run --all-files
|
||||||
|
|
||||||
|
# Run a specific hook:
|
||||||
|
pre-commit run ruff-check --all-files
|
||||||
|
|
||||||
|
# Run mypy as it is in CI:
|
||||||
|
pre-commit run mypy-3.10 --all-files --hook-stage manual
|
||||||
|
```
|
||||||
|
|
||||||
|
### Commit messages
|
||||||
|
|
||||||
|
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Your commit message here
|
||||||
|
|
||||||
|
Co-authored-by: GitHub Copilot
|
||||||
|
Co-authored-by: Claude
|
||||||
|
Co-authored-by: gemini-code-assist
|
||||||
|
Signed-off-by: Your Name <your.email@example.com>
|
||||||
|
```
|
||||||
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
|||||||
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
|
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
|
||||||
|
|
||||||
# ROCm installation prefix. Default to /opt/rocm but allow override via
|
# ROCm installation prefix. Default to /opt/rocm but allow override via
|
||||||
# -DROCM_PATH=/your/rocm/path when invoking cmake.
|
# -DROCM_PATH=/your/rocm/path when invoking cmake.
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ python benchmark.py \
|
|||||||
## Hardware Requirements
|
## Hardware Requirements
|
||||||
|
|
||||||
| Backend | Hardware |
|
| Backend | Hardware |
|
||||||
|---------|----------|
|
| ------- | -------- |
|
||||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||||
| CUTLASS MLA | Blackwell (SM100+) |
|
| CUTLASS MLA | Blackwell (SM100+) |
|
||||||
| FlashAttn MLA | Hopper (SM90+) |
|
| FlashAttn MLA | Hopper (SM90+) |
|
||||||
|
|||||||
@@ -47,6 +47,8 @@ from common import (
|
|||||||
is_mla_backend,
|
is_mla_backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from vllm.v1.worker.workspace import init_workspace_manager
|
||||||
|
|
||||||
|
|
||||||
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||||
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||||
@@ -59,7 +61,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
|||||||
"""Run MLA benchmark with appropriate backend."""
|
"""Run MLA benchmark with appropriate backend."""
|
||||||
from mla_runner import run_mla_benchmark as run_mla
|
from mla_runner import run_mla_benchmark as run_mla
|
||||||
|
|
||||||
return run_mla(config.backend, config, **kwargs)
|
return run_mla(
|
||||||
|
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||||
@@ -440,20 +444,27 @@ def main():
|
|||||||
# Backend selection
|
# Backend selection
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--backends",
|
"--backends",
|
||||||
|
"--decode-backends",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
||||||
"flashinfer_mla, flashattn_mla, flashmla)",
|
"flashinfer_mla, flashattn_mla, flashmla)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--backend",
|
"--backend",
|
||||||
help="Single backend (alternative to --backends)",
|
help="Single backend (alternative to --backends)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--prefill-backends",
|
||||||
|
nargs="+",
|
||||||
|
help="Prefill backends to compare (fa2, fa3, fa4). "
|
||||||
|
"Uses the first decode backend for impl construction.",
|
||||||
|
)
|
||||||
|
|
||||||
# Batch specifications
|
# Batch specifications
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch-specs",
|
"--batch-specs",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=["q2k", "8q1s1k"],
|
default=None,
|
||||||
help="Batch specifications using extended grammar",
|
help="Batch specifications using extended grammar",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -469,6 +480,21 @@ def main():
|
|||||||
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
||||||
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
||||||
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
||||||
|
parser.add_argument(
|
||||||
|
"--kv-cache-dtype",
|
||||||
|
default="auto",
|
||||||
|
choices=["auto", "fp8"],
|
||||||
|
help="KV cache dtype: auto or fp8",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--cuda-graphs",
|
||||||
|
action=argparse.BooleanOptionalAction,
|
||||||
|
default=True,
|
||||||
|
help=(
|
||||||
|
"Launch kernels with CUDA graphs to eliminate CPU overhead"
|
||||||
|
"in measurements (default: True)"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# Parameter sweep (use YAML config for advanced sweeps)
|
# Parameter sweep (use YAML config for advanced sweeps)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -502,7 +528,7 @@ def main():
|
|||||||
|
|
||||||
# Override args with YAML values, but CLI args take precedence
|
# Override args with YAML values, but CLI args take precedence
|
||||||
# Check if CLI provided backends (they would be non-None and not default)
|
# Check if CLI provided backends (they would be non-None and not default)
|
||||||
cli_backends_provided = args.backends is not None or args.backend is not None
|
cli_backends_provided = args.backend is not None or args.backends is not None
|
||||||
|
|
||||||
# Backend(s) - only use YAML if CLI didn't specify
|
# Backend(s) - only use YAML if CLI didn't specify
|
||||||
if not cli_backends_provided:
|
if not cli_backends_provided:
|
||||||
@@ -512,6 +538,12 @@ def main():
|
|||||||
elif "backends" in yaml_config:
|
elif "backends" in yaml_config:
|
||||||
args.backends = yaml_config["backends"]
|
args.backends = yaml_config["backends"]
|
||||||
args.backend = None
|
args.backend = None
|
||||||
|
elif "decode_backends" in yaml_config:
|
||||||
|
args.backends = yaml_config["decode_backends"]
|
||||||
|
args.backend = None
|
||||||
|
|
||||||
|
# Prefill backends (e.g., ["fa3", "fa4"])
|
||||||
|
args.prefill_backends = yaml_config.get("prefill_backends", None)
|
||||||
|
|
||||||
# Check for special modes
|
# Check for special modes
|
||||||
if "mode" in yaml_config:
|
if "mode" in yaml_config:
|
||||||
@@ -521,21 +553,24 @@ def main():
|
|||||||
|
|
||||||
# Batch specs and sizes
|
# Batch specs and sizes
|
||||||
# Support both explicit batch_specs and generated batch_spec_ranges
|
# Support both explicit batch_specs and generated batch_spec_ranges
|
||||||
if "batch_spec_ranges" in yaml_config:
|
# CLI --batch-specs takes precedence over YAML when provided.
|
||||||
# Generate batch specs from ranges
|
cli_batch_specs_provided = args.batch_specs is not None
|
||||||
generated_specs = generate_batch_specs_from_ranges(
|
if not cli_batch_specs_provided:
|
||||||
yaml_config["batch_spec_ranges"]
|
if "batch_spec_ranges" in yaml_config:
|
||||||
)
|
# Generate batch specs from ranges
|
||||||
# Combine with any explicit batch_specs
|
generated_specs = generate_batch_specs_from_ranges(
|
||||||
if "batch_specs" in yaml_config:
|
yaml_config["batch_spec_ranges"]
|
||||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
)
|
||||||
else:
|
# Combine with any explicit batch_specs
|
||||||
args.batch_specs = generated_specs
|
if "batch_specs" in yaml_config:
|
||||||
console.print(
|
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
else:
|
||||||
)
|
args.batch_specs = generated_specs
|
||||||
elif "batch_specs" in yaml_config:
|
console.print(
|
||||||
args.batch_specs = yaml_config["batch_specs"]
|
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||||
|
)
|
||||||
|
elif "batch_specs" in yaml_config:
|
||||||
|
args.batch_specs = yaml_config["batch_specs"]
|
||||||
|
|
||||||
if "batch_sizes" in yaml_config:
|
if "batch_sizes" in yaml_config:
|
||||||
args.batch_sizes = yaml_config["batch_sizes"]
|
args.batch_sizes = yaml_config["batch_sizes"]
|
||||||
@@ -560,6 +595,10 @@ def main():
|
|||||||
args.warmup_iters = yaml_config["warmup_iters"]
|
args.warmup_iters = yaml_config["warmup_iters"]
|
||||||
if "profile_memory" in yaml_config:
|
if "profile_memory" in yaml_config:
|
||||||
args.profile_memory = yaml_config["profile_memory"]
|
args.profile_memory = yaml_config["profile_memory"]
|
||||||
|
if "kv_cache_dtype" in yaml_config:
|
||||||
|
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
|
||||||
|
if "cuda_graphs" in yaml_config:
|
||||||
|
args.cuda_graphs = yaml_config["cuda_graphs"]
|
||||||
|
|
||||||
# Parameter sweep configuration
|
# Parameter sweep configuration
|
||||||
if "parameter_sweep" in yaml_config:
|
if "parameter_sweep" in yaml_config:
|
||||||
@@ -613,10 +652,19 @@ def main():
|
|||||||
|
|
||||||
# Determine backends
|
# Determine backends
|
||||||
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
||||||
|
prefill_backends = getattr(args, "prefill_backends", None)
|
||||||
|
if not args.batch_specs:
|
||||||
|
args.batch_specs = ["q2k", "8q1s1k"]
|
||||||
console.print(f"Backends: {', '.join(backends)}")
|
console.print(f"Backends: {', '.join(backends)}")
|
||||||
|
if prefill_backends:
|
||||||
|
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
|
||||||
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
||||||
|
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
|
||||||
|
console.print(f"CUDA graphs: {args.cuda_graphs}")
|
||||||
console.print()
|
console.print()
|
||||||
|
|
||||||
|
init_workspace_manager(args.device)
|
||||||
|
|
||||||
# Run benchmarks
|
# Run benchmarks
|
||||||
all_results = []
|
all_results = []
|
||||||
|
|
||||||
@@ -669,6 +717,8 @@ def main():
|
|||||||
repeats=args.repeats,
|
repeats=args.repeats,
|
||||||
warmup_iters=args.warmup_iters,
|
warmup_iters=args.warmup_iters,
|
||||||
profile_memory=args.profile_memory,
|
profile_memory=args.profile_memory,
|
||||||
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
|
use_cuda_graphs=args.cuda_graphs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add decode pipeline config
|
# Add decode pipeline config
|
||||||
@@ -821,6 +871,8 @@ def main():
|
|||||||
"repeats": args.repeats,
|
"repeats": args.repeats,
|
||||||
"warmup_iters": args.warmup_iters,
|
"warmup_iters": args.warmup_iters,
|
||||||
"profile_memory": args.profile_memory,
|
"profile_memory": args.profile_memory,
|
||||||
|
"kv_cache_dtype": args.kv_cache_dtype,
|
||||||
|
"use_cuda_graphs": args.cuda_graphs,
|
||||||
}
|
}
|
||||||
all_results = run_model_parameter_sweep(
|
all_results = run_model_parameter_sweep(
|
||||||
backends,
|
backends,
|
||||||
@@ -843,6 +895,8 @@ def main():
|
|||||||
"repeats": args.repeats,
|
"repeats": args.repeats,
|
||||||
"warmup_iters": args.warmup_iters,
|
"warmup_iters": args.warmup_iters,
|
||||||
"profile_memory": args.profile_memory,
|
"profile_memory": args.profile_memory,
|
||||||
|
"kv_cache_dtype": args.kv_cache_dtype,
|
||||||
|
"use_cuda_graphs": args.cuda_graphs,
|
||||||
}
|
}
|
||||||
all_results = run_parameter_sweep(
|
all_results = run_parameter_sweep(
|
||||||
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
||||||
@@ -850,37 +904,95 @@ def main():
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
# Normal mode: compare backends
|
# Normal mode: compare backends
|
||||||
total = len(backends) * len(args.batch_specs)
|
decode_results = []
|
||||||
|
prefill_results = []
|
||||||
|
|
||||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
# Run decode backend comparison
|
||||||
for spec in args.batch_specs:
|
if not prefill_backends:
|
||||||
for backend in backends:
|
# No prefill backends specified: compare decode backends as before
|
||||||
config = BenchmarkConfig(
|
total = len(backends) * len(args.batch_specs)
|
||||||
backend=backend,
|
|
||||||
batch_spec=spec,
|
|
||||||
num_layers=args.num_layers,
|
|
||||||
head_dim=args.head_dim,
|
|
||||||
num_q_heads=args.num_q_heads,
|
|
||||||
num_kv_heads=args.num_kv_heads,
|
|
||||||
block_size=args.block_size,
|
|
||||||
device=args.device,
|
|
||||||
repeats=args.repeats,
|
|
||||||
warmup_iters=args.warmup_iters,
|
|
||||||
profile_memory=args.profile_memory,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = run_benchmark(config)
|
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||||
all_results.append(result)
|
for spec in args.batch_specs:
|
||||||
|
for backend in backends:
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend=backend,
|
||||||
|
batch_spec=spec,
|
||||||
|
num_layers=args.num_layers,
|
||||||
|
head_dim=args.head_dim,
|
||||||
|
num_q_heads=args.num_q_heads,
|
||||||
|
num_kv_heads=args.num_kv_heads,
|
||||||
|
block_size=args.block_size,
|
||||||
|
device=args.device,
|
||||||
|
repeats=args.repeats,
|
||||||
|
warmup_iters=args.warmup_iters,
|
||||||
|
profile_memory=args.profile_memory,
|
||||||
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
|
use_cuda_graphs=args.cuda_graphs,
|
||||||
|
)
|
||||||
|
|
||||||
if not result.success:
|
result = run_benchmark(config)
|
||||||
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
|
decode_results.append(result)
|
||||||
|
|
||||||
pbar.update(1)
|
if not result.success:
|
||||||
|
console.print(
|
||||||
|
f"[red]Error {backend} {spec}: {result.error}[/]"
|
||||||
|
)
|
||||||
|
|
||||||
# Display results
|
pbar.update(1)
|
||||||
console.print("\n[bold green]Results:[/]")
|
|
||||||
formatter = ResultsFormatter(console)
|
console.print("\n[bold green]Results:[/]")
|
||||||
formatter.print_table(all_results, backends)
|
formatter = ResultsFormatter(console)
|
||||||
|
formatter.print_table(decode_results, backends)
|
||||||
|
|
||||||
|
# Run prefill backend comparison
|
||||||
|
if prefill_backends:
|
||||||
|
# Use first decode backend for impl construction
|
||||||
|
decode_backend = backends[0]
|
||||||
|
total = len(prefill_backends) * len(args.batch_specs)
|
||||||
|
|
||||||
|
console.print(
|
||||||
|
f"[yellow]Prefill comparison mode: "
|
||||||
|
f"using {decode_backend} for decode impl[/]"
|
||||||
|
)
|
||||||
|
|
||||||
|
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
|
||||||
|
for spec in args.batch_specs:
|
||||||
|
for pb in prefill_backends:
|
||||||
|
config = BenchmarkConfig(
|
||||||
|
backend=decode_backend,
|
||||||
|
batch_spec=spec,
|
||||||
|
num_layers=args.num_layers,
|
||||||
|
head_dim=args.head_dim,
|
||||||
|
num_q_heads=args.num_q_heads,
|
||||||
|
num_kv_heads=args.num_kv_heads,
|
||||||
|
block_size=args.block_size,
|
||||||
|
device=args.device,
|
||||||
|
repeats=args.repeats,
|
||||||
|
warmup_iters=args.warmup_iters,
|
||||||
|
profile_memory=args.profile_memory,
|
||||||
|
prefill_backend=pb,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = run_benchmark(config)
|
||||||
|
|
||||||
|
# Label result with prefill backend name for display
|
||||||
|
labeled_config = replace(result.config, backend=pb)
|
||||||
|
result = replace(result, config=labeled_config)
|
||||||
|
prefill_results.append(result)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
|
||||||
|
|
||||||
|
pbar.update(1)
|
||||||
|
|
||||||
|
console.print("\n[bold green]Prefill Backend Results:[/]")
|
||||||
|
formatter = ResultsFormatter(console)
|
||||||
|
formatter.print_table(
|
||||||
|
prefill_results, prefill_backends, compare_to_fastest=True
|
||||||
|
)
|
||||||
|
|
||||||
|
all_results = decode_results + prefill_results
|
||||||
|
|
||||||
# Save results
|
# Save results
|
||||||
if all_results:
|
if all_results:
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
|||||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||||
return (batch_size, max_q_len, max_kv_len)
|
return (batch_size, max_q_len, max_kv_len)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback for unparseable specs
|
# Fallback for unparsable specs
|
||||||
return (0, 0, 0)
|
return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
@@ -77,6 +77,7 @@ class MockKVBProj:
|
|||||||
self.qk_nope_head_dim = qk_nope_head_dim
|
self.qk_nope_head_dim = qk_nope_head_dim
|
||||||
self.v_head_dim = v_head_dim
|
self.v_head_dim = v_head_dim
|
||||||
self.out_dim = qk_nope_head_dim + v_head_dim
|
self.out_dim = qk_nope_head_dim + v_head_dim
|
||||||
|
self.weight = torch.empty(0, dtype=torch.bfloat16)
|
||||||
|
|
||||||
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
@@ -212,7 +213,11 @@ class BenchmarkConfig:
|
|||||||
profile_memory: bool = False
|
profile_memory: bool = False
|
||||||
use_cuda_graphs: bool = False
|
use_cuda_graphs: bool = False
|
||||||
|
|
||||||
|
# "auto" or "fp8"
|
||||||
|
kv_cache_dtype: str = "auto"
|
||||||
|
|
||||||
# MLA-specific
|
# MLA-specific
|
||||||
|
prefill_backend: str | None = None
|
||||||
kv_lora_rank: int | None = None
|
kv_lora_rank: int | None = None
|
||||||
qk_nope_head_dim: int | None = None
|
qk_nope_head_dim: int | None = None
|
||||||
qk_rope_head_dim: int | None = None
|
qk_rope_head_dim: int | None = None
|
||||||
@@ -367,6 +372,7 @@ class ResultsFormatter:
|
|||||||
"backend",
|
"backend",
|
||||||
"batch_spec",
|
"batch_spec",
|
||||||
"num_layers",
|
"num_layers",
|
||||||
|
"kv_cache_dtype",
|
||||||
"mean_time",
|
"mean_time",
|
||||||
"std_time",
|
"std_time",
|
||||||
"throughput",
|
"throughput",
|
||||||
@@ -380,6 +386,7 @@ class ResultsFormatter:
|
|||||||
"backend": r.config.backend,
|
"backend": r.config.backend,
|
||||||
"batch_spec": r.config.batch_spec,
|
"batch_spec": r.config.batch_spec,
|
||||||
"num_layers": r.config.num_layers,
|
"num_layers": r.config.num_layers,
|
||||||
|
"kv_cache_dtype": r.config.kv_cache_dtype,
|
||||||
"mean_time": r.mean_time,
|
"mean_time": r.mean_time,
|
||||||
"std_time": r.std_time,
|
"std_time": r.std_time,
|
||||||
"throughput": r.throughput_tokens_per_sec or 0,
|
"throughput": r.throughput_tokens_per_sec or 0,
|
||||||
|
|||||||
@@ -30,9 +30,9 @@ batch_specs:
|
|||||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||||
|
|
||||||
# Context extension + decode
|
# Context extension + decode
|
||||||
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
|
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode
|
||||||
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
|
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode
|
||||||
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
|
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode
|
||||||
|
|
||||||
# Explicitly chunked prefill
|
# Explicitly chunked prefill
|
||||||
- "q8k" # 8k prefill with chunking hint
|
- "q8k" # 8k prefill with chunking hint
|
||||||
|
|||||||
@@ -1,4 +1,19 @@
|
|||||||
# MLA prefill-only benchmark configuration for sparse backends
|
# MLA prefill backend comparison
|
||||||
|
#
|
||||||
|
# Compares all available MLA prefill backends:
|
||||||
|
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
|
||||||
|
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
|
||||||
|
#
|
||||||
|
# Uses cutlass_mla as the decode backend for impl construction
|
||||||
|
# (only the prefill path is exercised).
|
||||||
|
#
|
||||||
|
# Backends that aren't available on the current platform will report errors
|
||||||
|
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python benchmark.py --config configs/mla_prefill.yaml
|
||||||
|
|
||||||
|
description: "MLA prefill backend comparison"
|
||||||
|
|
||||||
model:
|
model:
|
||||||
name: "deepseek-v3"
|
name: "deepseek-v3"
|
||||||
@@ -12,20 +27,25 @@ model:
|
|||||||
v_head_dim: 128
|
v_head_dim: 128
|
||||||
block_size: 128
|
block_size: 128
|
||||||
|
|
||||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
# model:
|
||||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
# name: "deepseek-v2-lite"
|
||||||
model_parameter_sweep:
|
# num_layers: 27
|
||||||
param_name: "num_q_heads"
|
# num_q_heads: 16
|
||||||
values: [128, 64, 32, 16]
|
# num_kv_heads: 1
|
||||||
label_format: "{backend}_{value}h"
|
# head_dim: 576
|
||||||
|
# kv_lora_rank: 512
|
||||||
|
# qk_nope_head_dim: 128
|
||||||
|
# qk_rope_head_dim: 64
|
||||||
|
# v_head_dim: 128
|
||||||
|
# block_size: 128
|
||||||
|
|
||||||
batch_specs:
|
batch_specs:
|
||||||
# Pure prefill
|
# Pure prefill
|
||||||
- "1q512"
|
- "q512"
|
||||||
- "1q1k"
|
- "q1k"
|
||||||
- "1q2k"
|
- "q2k"
|
||||||
- "1q4k"
|
- "q4k"
|
||||||
- "1q8k"
|
- "q8k"
|
||||||
|
|
||||||
# Batched pure prefill
|
# Batched pure prefill
|
||||||
- "2q512"
|
- "2q512"
|
||||||
@@ -44,19 +64,63 @@ batch_specs:
|
|||||||
- "8q4k"
|
- "8q4k"
|
||||||
- "8q8k"
|
- "8q8k"
|
||||||
|
|
||||||
# Extend
|
# Chunked prefill / extend
|
||||||
- "1q512s4k"
|
# Short context
|
||||||
- "1q512s8k"
|
- "q128s1k"
|
||||||
- "1q1ks8k"
|
- "q256s2k"
|
||||||
- "1q2ks8k"
|
- "q512s4k"
|
||||||
- "1q2ks16k"
|
- "q1ks4k"
|
||||||
- "1q4ks16k"
|
- "q2ks8k"
|
||||||
|
- "2q128s1k"
|
||||||
|
- "2q256s2k"
|
||||||
|
- "2q512s4k"
|
||||||
|
- "2q1ks4k"
|
||||||
|
- "2q2ks8k"
|
||||||
|
- "4q128s1k"
|
||||||
|
- "4q256s2k"
|
||||||
|
- "4q512s4k"
|
||||||
|
- "4q1ks4k"
|
||||||
|
- "4q2ks8k"
|
||||||
|
- "8q128s1k"
|
||||||
|
- "8q256s2k"
|
||||||
|
- "8q512s4k"
|
||||||
|
- "8q1ks4k"
|
||||||
|
|
||||||
backends:
|
# Medium context
|
||||||
- FLASHMLA_SPARSE
|
- "q128s16k"
|
||||||
- FLASHINFER_MLA_SPARSE
|
- "q512s16k"
|
||||||
|
- "q1ks16k"
|
||||||
|
- "q2ks16k"
|
||||||
|
- "2q128s16k"
|
||||||
|
- "2q512s16k"
|
||||||
|
- "2q1ks16k"
|
||||||
|
- "2q2ks16k"
|
||||||
|
- "4q128s16k"
|
||||||
|
- "4q512s16k"
|
||||||
|
- "4q1ks16k"
|
||||||
|
- "4q2ks16k"
|
||||||
|
|
||||||
|
# Long context
|
||||||
|
- "q128s64k"
|
||||||
|
- "q512s64k"
|
||||||
|
- "q1ks64k"
|
||||||
|
- "q2ks64k"
|
||||||
|
- "2q128s64k"
|
||||||
|
- "2q512s64k"
|
||||||
|
- "2q1ks64k"
|
||||||
|
- "2q2ks64k"
|
||||||
|
|
||||||
|
decode_backends:
|
||||||
|
- CUTLASS_MLA
|
||||||
|
|
||||||
|
prefill_backends:
|
||||||
|
- fa2
|
||||||
|
- fa3
|
||||||
|
- fa4
|
||||||
|
- flashinfer
|
||||||
|
- cudnn
|
||||||
|
- trtllm
|
||||||
|
|
||||||
device: "cuda:0"
|
device: "cuda:0"
|
||||||
repeats: 10
|
repeats: 20
|
||||||
warmup_iters: 3
|
warmup_iters: 5
|
||||||
profile_memory: true
|
|
||||||
|
|||||||
@@ -0,0 +1,58 @@
|
|||||||
|
# MLA decode-only benchmark configuration
|
||||||
|
|
||||||
|
model:
|
||||||
|
name: "deepseek-v3"
|
||||||
|
num_layers: 60
|
||||||
|
num_q_heads: 128 # Base value, can be swept for TP simulation
|
||||||
|
num_kv_heads: 1 # MLA uses single latent KV
|
||||||
|
head_dim: 576
|
||||||
|
kv_lora_rank: 512
|
||||||
|
qk_nope_head_dim: 128
|
||||||
|
qk_rope_head_dim: 64
|
||||||
|
v_head_dim: 128
|
||||||
|
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||||
|
|
||||||
|
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||||
|
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||||
|
model_parameter_sweep:
|
||||||
|
param_name: "num_q_heads"
|
||||||
|
values: [128, 64, 32, 16]
|
||||||
|
label_format: "{backend}_{value}h"
|
||||||
|
|
||||||
|
batch_specs:
|
||||||
|
# Small batches, varying sequence lengths
|
||||||
|
- "16q1s512" # 16 requests, 512 KV cache
|
||||||
|
- "16q1s1k" # 16 requests, 1k KV cache
|
||||||
|
- "16q1s2k" # 16 requests, 2k KV cache
|
||||||
|
- "16q1s4k" # 16 requests, 4k KV cache
|
||||||
|
|
||||||
|
# Medium batches
|
||||||
|
- "32q1s1k" # 32 requests, 1k KV cache
|
||||||
|
- "32q1s2k" # 32 requests, 2k KV cache
|
||||||
|
- "32q1s4k" # 32 requests, 4k KV cache
|
||||||
|
- "32q1s8k" # 32 requests, 8k KV cache
|
||||||
|
|
||||||
|
# Large batches
|
||||||
|
- "64q1s1k" # 64 requests, 1k KV cache
|
||||||
|
- "64q1s2k" # 64 requests, 2k KV cache
|
||||||
|
- "64q1s4k" # 64 requests, 4k KV cache
|
||||||
|
- "64q1s8k" # 64 requests, 8k KV cache
|
||||||
|
|
||||||
|
# Very large batches
|
||||||
|
- "128q1s1k" # 128 requests, 1k KV cache
|
||||||
|
- "128q1s2k" # 128 requests, 2k KV cache
|
||||||
|
- "128q1s4k" # 128 requests, 4k KV cache
|
||||||
|
- "128q1s8k" # 128 requests, 8k KV cache
|
||||||
|
|
||||||
|
# Long context
|
||||||
|
- "32q1s16k" # 32 requests, 16k KV cache
|
||||||
|
- "32q1s32k" # 32 requests, 32k KV cache
|
||||||
|
|
||||||
|
backends:
|
||||||
|
- FLASHMLA_SPARSE
|
||||||
|
- FLASHINFER_MLA_SPARSE
|
||||||
|
|
||||||
|
device: "cuda:0"
|
||||||
|
repeats: 100
|
||||||
|
warmup_iters: 10
|
||||||
|
profile_memory: true
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
# MLA prefill-only benchmark configuration for sparse backends
|
||||||
|
|
||||||
|
model:
|
||||||
|
name: "deepseek-v3"
|
||||||
|
num_layers: 60
|
||||||
|
num_q_heads: 128
|
||||||
|
num_kv_heads: 1
|
||||||
|
head_dim: 576
|
||||||
|
kv_lora_rank: 512
|
||||||
|
qk_nope_head_dim: 128
|
||||||
|
qk_rope_head_dim: 64
|
||||||
|
v_head_dim: 128
|
||||||
|
block_size: 128
|
||||||
|
|
||||||
|
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||||
|
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||||
|
model_parameter_sweep:
|
||||||
|
param_name: "num_q_heads"
|
||||||
|
values: [128, 64, 32, 16]
|
||||||
|
label_format: "{backend}_{value}h"
|
||||||
|
|
||||||
|
batch_specs:
|
||||||
|
# Pure prefill
|
||||||
|
- "1q512"
|
||||||
|
- "1q1k"
|
||||||
|
- "1q2k"
|
||||||
|
- "1q4k"
|
||||||
|
- "1q8k"
|
||||||
|
|
||||||
|
# Batched pure prefill
|
||||||
|
- "2q512"
|
||||||
|
- "2q1k"
|
||||||
|
- "2q2k"
|
||||||
|
- "2q4k"
|
||||||
|
- "2q8k"
|
||||||
|
- "4q512"
|
||||||
|
- "4q1k"
|
||||||
|
- "4q2k"
|
||||||
|
- "4q4k"
|
||||||
|
- "4q8k"
|
||||||
|
- "8q512"
|
||||||
|
- "8q1k"
|
||||||
|
- "8q2k"
|
||||||
|
- "8q4k"
|
||||||
|
- "8q8k"
|
||||||
|
|
||||||
|
# Extend
|
||||||
|
- "1q512s4k"
|
||||||
|
- "1q512s8k"
|
||||||
|
- "1q1ks8k"
|
||||||
|
- "1q2ks8k"
|
||||||
|
- "1q2ks16k"
|
||||||
|
- "1q4ks16k"
|
||||||
|
|
||||||
|
backends:
|
||||||
|
- FLASHMLA_SPARSE
|
||||||
|
- FLASHINFER_MLA_SPARSE
|
||||||
|
|
||||||
|
device: "cuda:0"
|
||||||
|
repeats: 10
|
||||||
|
warmup_iters: 3
|
||||||
|
profile_memory: true
|
||||||
@@ -60,8 +60,11 @@ def create_minimal_vllm_config(
|
|||||||
model_name: str = "deepseek-v3",
|
model_name: str = "deepseek-v3",
|
||||||
block_size: int = 128,
|
block_size: int = 128,
|
||||||
max_num_seqs: int = 256,
|
max_num_seqs: int = 256,
|
||||||
|
max_num_batched_tokens: int = 8192,
|
||||||
mla_dims: dict | None = None,
|
mla_dims: dict | None = None,
|
||||||
index_topk: int | None = None,
|
index_topk: int | None = None,
|
||||||
|
prefill_backend: str | None = None,
|
||||||
|
kv_cache_dtype: str = "auto",
|
||||||
) -> VllmConfig:
|
) -> VllmConfig:
|
||||||
"""
|
"""
|
||||||
Create minimal VllmConfig for MLA benchmarks.
|
Create minimal VllmConfig for MLA benchmarks.
|
||||||
@@ -75,6 +78,9 @@ def create_minimal_vllm_config(
|
|||||||
setup_mla_dims(model_name)
|
setup_mla_dims(model_name)
|
||||||
index_topk: Optional topk value for sparse MLA backends. If provided,
|
index_topk: Optional topk value for sparse MLA backends. If provided,
|
||||||
the config will include index_topk for sparse attention.
|
the config will include index_topk for sparse attention.
|
||||||
|
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
|
||||||
|
"cudnn", "trtllm"). Configures the attention config to
|
||||||
|
force the specified prefill backend.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
VllmConfig for benchmarking
|
VllmConfig for benchmarking
|
||||||
@@ -145,14 +151,13 @@ def create_minimal_vllm_config(
|
|||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
gpu_memory_utilization=0.9,
|
gpu_memory_utilization=0.9,
|
||||||
swap_space=0,
|
cache_dtype=kv_cache_dtype,
|
||||||
cache_dtype="auto",
|
|
||||||
enable_prefix_caching=False,
|
enable_prefix_caching=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
scheduler_config = SchedulerConfig(
|
scheduler_config = SchedulerConfig(
|
||||||
max_num_seqs=max_num_seqs,
|
max_num_seqs=max_num_seqs,
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
|
||||||
max_model_len=32768,
|
max_model_len=32768,
|
||||||
is_encoder_decoder=False,
|
is_encoder_decoder=False,
|
||||||
enable_chunked_prefill=True,
|
enable_chunked_prefill=True,
|
||||||
@@ -164,7 +169,7 @@ def create_minimal_vllm_config(
|
|||||||
|
|
||||||
compilation_config = CompilationConfig()
|
compilation_config = CompilationConfig()
|
||||||
|
|
||||||
return VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
@@ -172,9 +177,84 @@ def create_minimal_vllm_config(
|
|||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if prefill_backend is not None:
|
||||||
|
prefill_cfg = get_prefill_backend_config(prefill_backend)
|
||||||
|
if prefill_cfg["flash_attn_version"] is not None:
|
||||||
|
vllm_config.attention_config.flash_attn_version = prefill_cfg[
|
||||||
|
"flash_attn_version"
|
||||||
|
]
|
||||||
|
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
|
||||||
|
"disable_flashinfer_prefill"
|
||||||
|
]
|
||||||
|
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
|
||||||
|
"use_cudnn_prefill"
|
||||||
|
]
|
||||||
|
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
|
||||||
|
"use_trtllm_ragged_deepseek_prefill"
|
||||||
|
]
|
||||||
|
|
||||||
|
return vllm_config
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Backend Configuration
|
# Prefill Backend Configuration
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Maps prefill backend names to attention config overrides.
|
||||||
|
# FA backends set flash_attn_version and disable non-FA paths.
|
||||||
|
# Non-FA backends enable their specific path and disable others.
|
||||||
|
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
|
||||||
|
"fa2": {
|
||||||
|
"flash_attn_version": 2,
|
||||||
|
"disable_flashinfer_prefill": True,
|
||||||
|
"use_cudnn_prefill": False,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": False,
|
||||||
|
},
|
||||||
|
"fa3": {
|
||||||
|
"flash_attn_version": 3,
|
||||||
|
"disable_flashinfer_prefill": True,
|
||||||
|
"use_cudnn_prefill": False,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": False,
|
||||||
|
},
|
||||||
|
"fa4": {
|
||||||
|
"flash_attn_version": 4,
|
||||||
|
"disable_flashinfer_prefill": True,
|
||||||
|
"use_cudnn_prefill": False,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": False,
|
||||||
|
},
|
||||||
|
"flashinfer": {
|
||||||
|
"flash_attn_version": None,
|
||||||
|
"disable_flashinfer_prefill": False,
|
||||||
|
"use_cudnn_prefill": False,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": False,
|
||||||
|
},
|
||||||
|
"cudnn": {
|
||||||
|
"flash_attn_version": None,
|
||||||
|
"disable_flashinfer_prefill": True,
|
||||||
|
"use_cudnn_prefill": True,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": False,
|
||||||
|
},
|
||||||
|
"trtllm": {
|
||||||
|
"flash_attn_version": None,
|
||||||
|
"disable_flashinfer_prefill": True,
|
||||||
|
"use_cudnn_prefill": False,
|
||||||
|
"use_trtllm_ragged_deepseek_prefill": True,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_prefill_backend_config(prefill_backend: str) -> dict:
|
||||||
|
"""Get attention config overrides for a prefill backend."""
|
||||||
|
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
|
||||||
|
raise ValueError(
|
||||||
|
f"Unknown prefill backend: {prefill_backend!r}. "
|
||||||
|
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
|
||||||
|
)
|
||||||
|
return _PREFILL_BACKEND_CONFIG[prefill_backend]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Decode Backend Configuration
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
@@ -204,6 +284,7 @@ def _get_backend_config(backend: str) -> dict:
|
|||||||
Returns:
|
Returns:
|
||||||
Dict with backend configuration
|
Dict with backend configuration
|
||||||
"""
|
"""
|
||||||
|
from vllm.v1.attention.backend import MultipleOf
|
||||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -220,8 +301,8 @@ def _get_backend_config(backend: str) -> dict:
|
|||||||
block_sizes = backend_class.get_supported_kernel_block_sizes()
|
block_sizes = backend_class.get_supported_kernel_block_sizes()
|
||||||
# Use first supported block size (backends typically support one for MLA)
|
# Use first supported block size (backends typically support one for MLA)
|
||||||
block_size = block_sizes[0] if block_sizes else None
|
block_size = block_sizes[0] if block_sizes else None
|
||||||
if hasattr(block_size, "value"):
|
if isinstance(block_size, MultipleOf):
|
||||||
# Handle MultipleOf enum
|
# No fixed block size; fall back to config value
|
||||||
block_size = None
|
block_size = None
|
||||||
|
|
||||||
# Check if sparse via class method if available
|
# Check if sparse via class method if available
|
||||||
@@ -456,6 +537,7 @@ def _create_backend_impl(
|
|||||||
device: torch.device,
|
device: torch.device,
|
||||||
max_num_tokens: int = 8192,
|
max_num_tokens: int = 8192,
|
||||||
index_topk: int | None = None,
|
index_topk: int | None = None,
|
||||||
|
kv_cache_dtype: str = "auto",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create backend implementation instance.
|
Create backend implementation instance.
|
||||||
@@ -504,7 +586,7 @@ def _create_backend_impl(
|
|||||||
"num_kv_heads": mla_dims["num_kv_heads"],
|
"num_kv_heads": mla_dims["num_kv_heads"],
|
||||||
"alibi_slopes": None,
|
"alibi_slopes": None,
|
||||||
"sliding_window": None,
|
"sliding_window": None,
|
||||||
"kv_cache_dtype": "auto",
|
"kv_cache_dtype": kv_cache_dtype,
|
||||||
"logits_soft_cap": None,
|
"logits_soft_cap": None,
|
||||||
"attn_type": "decoder",
|
"attn_type": "decoder",
|
||||||
"kv_sharing_target_layer_name": None,
|
"kv_sharing_target_layer_name": None,
|
||||||
@@ -622,6 +704,7 @@ def _run_single_benchmark(
|
|||||||
mla_dims: dict,
|
mla_dims: dict,
|
||||||
device: torch.device,
|
device: torch.device,
|
||||||
indexer=None,
|
indexer=None,
|
||||||
|
kv_cache_dtype: str | None = None,
|
||||||
) -> BenchmarkResult:
|
) -> BenchmarkResult:
|
||||||
"""
|
"""
|
||||||
Run a single benchmark iteration.
|
Run a single benchmark iteration.
|
||||||
@@ -655,53 +738,123 @@ def _run_single_benchmark(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create KV cache
|
# Create KV cache
|
||||||
kv_cache = torch.zeros(
|
if kv_cache_dtype is None:
|
||||||
num_blocks,
|
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
|
||||||
block_size,
|
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
|
||||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
if kv_cache_dtype == "fp8_ds_mla":
|
||||||
device=device,
|
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
|
||||||
dtype=torch.bfloat16,
|
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
|
||||||
)
|
# + 2*rope_dim bf16 bytes
|
||||||
|
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
|
||||||
|
kv_cache = torch.zeros(
|
||||||
|
num_blocks,
|
||||||
|
block_size,
|
||||||
|
656,
|
||||||
|
device=device,
|
||||||
|
dtype=torch.uint8,
|
||||||
|
)
|
||||||
|
elif kv_cache_dtype == "fp8":
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
# Create input tensors for both decode and prefill modes
|
kv_cache = torch.zeros(
|
||||||
decode_inputs, prefill_inputs = _create_input_tensors(
|
num_blocks,
|
||||||
total_q,
|
block_size,
|
||||||
mla_dims,
|
head_size,
|
||||||
backend_cfg["query_format"],
|
device=device,
|
||||||
device,
|
dtype=torch.uint8,
|
||||||
torch.bfloat16,
|
).view(current_platform.fp8_dtype())
|
||||||
)
|
else:
|
||||||
|
kv_cache = torch.zeros(
|
||||||
|
num_blocks,
|
||||||
|
block_size,
|
||||||
|
head_size,
|
||||||
|
device=device,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
# Fill indexer with random indices for sparse backends
|
# Fill indexer with random indices for sparse backends
|
||||||
is_sparse = backend_cfg.get("is_sparse", False)
|
is_sparse = backend_cfg.get("is_sparse", False)
|
||||||
if is_sparse and indexer is not None:
|
if is_sparse and indexer is not None:
|
||||||
indexer.fill_random_indices(total_q, max_kv_len)
|
indexer.fill_random_indices(total_q, max_kv_len)
|
||||||
|
|
||||||
# Determine which forward method to use
|
# Determine which forward methods to use based on metadata.
|
||||||
if is_sparse:
|
# Sparse MLA backends always use forward_mqa
|
||||||
# Sparse backends use forward_mqa
|
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
|
||||||
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
|
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
|
||||||
elif metadata.decode is not None:
|
if not has_decode and not has_prefill:
|
||||||
forward_fn = lambda: impl._forward_decode(
|
|
||||||
decode_inputs, kv_cache, metadata, layer
|
|
||||||
)
|
|
||||||
elif metadata.prefill is not None:
|
|
||||||
forward_fn = lambda: impl._forward_prefill(
|
|
||||||
prefill_inputs["q"],
|
|
||||||
prefill_inputs["k_c_normed"],
|
|
||||||
prefill_inputs["k_pe"],
|
|
||||||
kv_cache,
|
|
||||||
metadata,
|
|
||||||
prefill_inputs["k_scale"],
|
|
||||||
prefill_inputs["output"],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
||||||
|
|
||||||
|
num_decode = (
|
||||||
|
metadata.num_decode_tokens
|
||||||
|
if (has_decode and has_prefill)
|
||||||
|
else total_q
|
||||||
|
if has_decode
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
num_prefill = total_q - num_decode
|
||||||
|
|
||||||
|
# Some backends require fp8 queries when using an fp8 KV cache.
|
||||||
|
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
|
||||||
|
quantize_query = is_fp8_kvcache and getattr(
|
||||||
|
impl, "supports_quant_query_input", False
|
||||||
|
)
|
||||||
|
|
||||||
|
# quantize_query forces concat format
|
||||||
|
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
|
||||||
|
|
||||||
|
# Create decode query tensors
|
||||||
|
if has_decode:
|
||||||
|
decode_inputs, _ = _create_input_tensors(
|
||||||
|
num_decode, mla_dims, query_fmt, device, torch.bfloat16
|
||||||
|
)
|
||||||
|
# Cast decode query to fp8 if the backend supports it
|
||||||
|
if quantize_query:
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
if isinstance(decode_inputs, tuple):
|
||||||
|
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
|
||||||
|
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
|
||||||
|
|
||||||
|
# Create prefill input tensors
|
||||||
|
if has_prefill:
|
||||||
|
_, prefill_inputs = _create_input_tensors(
|
||||||
|
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build forward function
|
||||||
|
def forward_fn():
|
||||||
|
results = []
|
||||||
|
if has_decode:
|
||||||
|
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
|
||||||
|
if has_prefill:
|
||||||
|
results.append(
|
||||||
|
impl.forward_mha(
|
||||||
|
prefill_inputs["q"],
|
||||||
|
prefill_inputs["k_c_normed"],
|
||||||
|
prefill_inputs["k_pe"],
|
||||||
|
kv_cache,
|
||||||
|
metadata,
|
||||||
|
prefill_inputs["k_scale"],
|
||||||
|
prefill_inputs["output"],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return results[0] if len(results) == 1 else tuple(results)
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(config.warmup_iters):
|
for _ in range(config.warmup_iters):
|
||||||
forward_fn()
|
forward_fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
|
# Optionally capture a CUDA graph after warmup.
|
||||||
|
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||||
|
# kernel time.
|
||||||
|
if config.use_cuda_graphs:
|
||||||
|
graph = torch.cuda.CUDAGraph()
|
||||||
|
with torch.cuda.graph(graph):
|
||||||
|
forward_fn()
|
||||||
|
benchmark_fn = graph.replay
|
||||||
|
else:
|
||||||
|
benchmark_fn = forward_fn
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -711,10 +864,10 @@ def _run_single_benchmark(
|
|||||||
|
|
||||||
start.record()
|
start.record()
|
||||||
for _ in range(config.num_layers):
|
for _ in range(config.num_layers):
|
||||||
forward_fn()
|
benchmark_fn()
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||||
|
|
||||||
@@ -733,6 +886,7 @@ def _run_mla_benchmark_batched(
|
|||||||
backend: str,
|
backend: str,
|
||||||
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
||||||
index_topk: int = 2048,
|
index_topk: int = 2048,
|
||||||
|
prefill_backend: str | None = None,
|
||||||
) -> list[BenchmarkResult]:
|
) -> list[BenchmarkResult]:
|
||||||
"""
|
"""
|
||||||
Unified batched MLA benchmark runner for all backends.
|
Unified batched MLA benchmark runner for all backends.
|
||||||
@@ -744,11 +898,13 @@ def _run_mla_benchmark_batched(
|
|||||||
to avoid setup/teardown overhead.
|
to avoid setup/teardown overhead.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
backend: Backend name
|
backend: Backend name (decode backend used for impl construction)
|
||||||
configs_with_params: List of (config, threshold, num_splits) tuples
|
configs_with_params: List of (config, threshold, num_splits) tuples
|
||||||
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
||||||
- num_splits: num_kv_splits (CUTLASS only)
|
- num_splits: num_kv_splits (CUTLASS only)
|
||||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||||
|
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
|
||||||
|
When set, forces the specified FlashAttention version for prefill.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of BenchmarkResult objects
|
List of BenchmarkResult objects
|
||||||
@@ -758,7 +914,7 @@ def _run_mla_benchmark_batched(
|
|||||||
|
|
||||||
backend_cfg = _get_backend_config(backend)
|
backend_cfg = _get_backend_config(backend)
|
||||||
device = torch.device(configs_with_params[0][0].device)
|
device = torch.device(configs_with_params[0][0].device)
|
||||||
torch.cuda.set_device(device)
|
torch.accelerator.set_device_index(device)
|
||||||
|
|
||||||
# Determine block size
|
# Determine block size
|
||||||
config_block_size = configs_with_params[0][0].block_size
|
config_block_size = configs_with_params[0][0].block_size
|
||||||
@@ -775,26 +931,91 @@ def _run_mla_benchmark_batched(
|
|||||||
# Determine if this is a sparse backend
|
# Determine if this is a sparse backend
|
||||||
is_sparse = backend_cfg.get("is_sparse", False)
|
is_sparse = backend_cfg.get("is_sparse", False)
|
||||||
|
|
||||||
|
# Extract kv_cache_dtype from the first config
|
||||||
|
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
|
||||||
|
|
||||||
|
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
|
||||||
|
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
|
||||||
|
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
|
||||||
|
kv_cache_dtype = "fp8_ds_mla"
|
||||||
|
|
||||||
|
# Compute max total_q across all configs so the metadata builder buffer
|
||||||
|
# and scheduler config are large enough for all batch specs.
|
||||||
|
max_total_q = max(
|
||||||
|
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
|
||||||
|
for cfg, *_ in configs_with_params
|
||||||
|
)
|
||||||
|
|
||||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||||
vllm_config = create_minimal_vllm_config(
|
vllm_config = create_minimal_vllm_config(
|
||||||
model_name="deepseek-v3", # Used only for model path
|
model_name="deepseek-v3", # Used only for model path
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
|
max_num_batched_tokens=max_total_q,
|
||||||
mla_dims=mla_dims, # Use custom dims from config or default
|
mla_dims=mla_dims, # Use custom dims from config or default
|
||||||
index_topk=index_topk if is_sparse else None,
|
index_topk=index_topk if is_sparse else None,
|
||||||
|
prefill_backend=prefill_backend,
|
||||||
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
|
# Clear cached prefill backend detection functions so they re-evaluate
|
||||||
|
# with the current VllmConfig. These are @functools.cache decorated and
|
||||||
|
# would otherwise return stale results from a previous backend's config.
|
||||||
|
from vllm.model_executor.layers.attention.mla_attention import (
|
||||||
|
use_cudnn_prefill,
|
||||||
|
use_flashinfer_prefill,
|
||||||
|
use_trtllm_ragged_deepseek_prefill,
|
||||||
|
)
|
||||||
|
|
||||||
|
use_flashinfer_prefill.cache_clear()
|
||||||
|
use_cudnn_prefill.cache_clear()
|
||||||
|
use_trtllm_ragged_deepseek_prefill.cache_clear()
|
||||||
|
|
||||||
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
|
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
|
||||||
impl, layer, builder_instance, indexer = _create_backend_impl(
|
impl, layer, builder_instance, indexer = _create_backend_impl(
|
||||||
backend_cfg,
|
backend_cfg,
|
||||||
mla_dims,
|
mla_dims,
|
||||||
vllm_config,
|
vllm_config,
|
||||||
device,
|
device,
|
||||||
|
max_num_tokens=max_total_q,
|
||||||
index_topk=index_topk if is_sparse else None,
|
index_topk=index_topk if is_sparse else None,
|
||||||
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Verify the actual prefill backend matches what was requested
|
||||||
|
if prefill_backend is not None:
|
||||||
|
prefill_cfg = get_prefill_backend_config(prefill_backend)
|
||||||
|
fa_version = prefill_cfg["flash_attn_version"]
|
||||||
|
|
||||||
|
if fa_version is not None:
|
||||||
|
# FA backend: verify the impl's FA version
|
||||||
|
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
|
||||||
|
if actual_fa_version != fa_version:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Prefill backend '{prefill_backend}' requested FA "
|
||||||
|
f"version {fa_version}, but the impl is using FA "
|
||||||
|
f"version {actual_fa_version}. Check "
|
||||||
|
f"vllm/v1/attention/backends/fa_utils.py."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Non-FA backend: verify the builder picked the right path
|
||||||
|
expected_flags = {
|
||||||
|
"flashinfer": "_use_fi_prefill",
|
||||||
|
"cudnn": "_use_cudnn_prefill",
|
||||||
|
"trtllm": "_use_trtllm_ragged_prefill",
|
||||||
|
}
|
||||||
|
flag_name = expected_flags.get(prefill_backend)
|
||||||
|
if flag_name and not getattr(builder_instance, flag_name, False):
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Prefill backend '{prefill_backend}' was requested "
|
||||||
|
f"but the metadata builder did not enable it. This "
|
||||||
|
f"usually means a dependency is missing (e.g., "
|
||||||
|
f"flashinfer not installed) or the platform doesn't "
|
||||||
|
f"support it."
|
||||||
|
)
|
||||||
|
|
||||||
# Run each benchmark with the shared impl
|
# Run each benchmark with the shared impl
|
||||||
for config, threshold, num_splits in configs_with_params:
|
for config, threshold, num_splits in configs_with_params:
|
||||||
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
|
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
|
||||||
@@ -819,6 +1040,7 @@ def _run_mla_benchmark_batched(
|
|||||||
mla_dims,
|
mla_dims,
|
||||||
device,
|
device,
|
||||||
indexer=indexer,
|
indexer=indexer,
|
||||||
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
)
|
)
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
@@ -845,6 +1067,7 @@ def run_mla_benchmark(
|
|||||||
reorder_batch_threshold: int | None = None,
|
reorder_batch_threshold: int | None = None,
|
||||||
num_kv_splits: int | None = None,
|
num_kv_splits: int | None = None,
|
||||||
index_topk: int = 2048,
|
index_topk: int = 2048,
|
||||||
|
prefill_backend: str | None = None,
|
||||||
) -> BenchmarkResult | list[BenchmarkResult]:
|
) -> BenchmarkResult | list[BenchmarkResult]:
|
||||||
"""
|
"""
|
||||||
Unified MLA benchmark runner for all backends.
|
Unified MLA benchmark runner for all backends.
|
||||||
@@ -862,6 +1085,8 @@ def run_mla_benchmark(
|
|||||||
(single config mode only)
|
(single config mode only)
|
||||||
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
||||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||||
|
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
|
||||||
|
When set, forces the specified FlashAttention version for prefill.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
||||||
@@ -885,7 +1110,9 @@ def run_mla_benchmark(
|
|||||||
return_single = True
|
return_single = True
|
||||||
|
|
||||||
# Use unified batched execution
|
# Use unified batched execution
|
||||||
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
|
results = _run_mla_benchmark_batched(
|
||||||
|
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
|
||||||
|
)
|
||||||
|
|
||||||
# Return single result or list based on input
|
# Return single result or list based on input
|
||||||
return results[0] if return_single else results
|
return results[0] if return_single else results
|
||||||
|
|||||||
@@ -140,8 +140,7 @@ def _create_vllm_config(
|
|||||||
|
|
||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=config.block_size,
|
block_size=config.block_size,
|
||||||
cache_dtype="auto",
|
cache_dtype=config.kv_cache_dtype,
|
||||||
swap_space=0,
|
|
||||||
)
|
)
|
||||||
cache_config.num_gpu_blocks = max_num_blocks
|
cache_config.num_gpu_blocks = max_num_blocks
|
||||||
cache_config.num_cpu_blocks = 0
|
cache_config.num_cpu_blocks = 0
|
||||||
@@ -216,7 +215,7 @@ def _create_backend_impl(
|
|||||||
num_kv_heads=config.num_kv_heads,
|
num_kv_heads=config.num_kv_heads,
|
||||||
alibi_slopes=None,
|
alibi_slopes=None,
|
||||||
sliding_window=None,
|
sliding_window=None,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype=config.kv_cache_dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
kv_cache_spec = FullAttentionSpec(
|
kv_cache_spec = FullAttentionSpec(
|
||||||
@@ -289,12 +288,22 @@ def _create_input_tensors(
|
|||||||
total_q: int,
|
total_q: int,
|
||||||
device: torch.device,
|
device: torch.device,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
|
quantize_query: bool = False,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
"""Create Q, K, V input tensors for all layers."""
|
"""Create Q, K, V input tensors for all layers.
|
||||||
|
|
||||||
|
When quantize_query is True, queries are cast to fp8 to match backends
|
||||||
|
that require query/key/value dtype consistency.
|
||||||
|
"""
|
||||||
|
q_dtype = dtype
|
||||||
|
if quantize_query:
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
q_dtype = current_platform.fp8_dtype()
|
||||||
q_list = [
|
q_list = [
|
||||||
torch.randn(
|
torch.randn(
|
||||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||||
)
|
).to(q_dtype)
|
||||||
for _ in range(config.num_layers)
|
for _ in range(config.num_layers)
|
||||||
]
|
]
|
||||||
k_list = [
|
k_list = [
|
||||||
@@ -345,10 +354,17 @@ def _create_kv_cache(
|
|||||||
# Compute inverse permutation to get back to logical view
|
# Compute inverse permutation to get back to logical view
|
||||||
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
|
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
|
||||||
|
|
||||||
|
# Use fp8 dtype for cache when requested.
|
||||||
|
cache_dtype = dtype
|
||||||
|
if config.kv_cache_dtype == "fp8":
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
cache_dtype = current_platform.fp8_dtype()
|
||||||
|
|
||||||
cache_list = []
|
cache_list = []
|
||||||
for _ in range(config.num_layers):
|
for _ in range(config.num_layers):
|
||||||
# Allocate in physical layout order (contiguous in memory)
|
# Allocate in physical layout order (contiguous in memory)
|
||||||
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
|
cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
|
||||||
# Permute to logical view
|
# Permute to logical view
|
||||||
cache = cache.permute(*inv_order)
|
cache = cache.permute(*inv_order)
|
||||||
cache_list.append(cache)
|
cache_list.append(cache)
|
||||||
@@ -391,7 +407,38 @@ def _run_single_benchmark(
|
|||||||
attn_metadata,
|
attn_metadata,
|
||||||
output=out,
|
output=out,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
|
# Optionally capture a CUDA graph after warmup.
|
||||||
|
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||||
|
# kernel time.
|
||||||
|
if config.use_cuda_graphs:
|
||||||
|
graph = torch.cuda.CUDAGraph()
|
||||||
|
with torch.cuda.graph(graph):
|
||||||
|
for i in range(config.num_layers):
|
||||||
|
impl.forward(
|
||||||
|
layer,
|
||||||
|
q_list[i],
|
||||||
|
k_list[i],
|
||||||
|
v_list[i],
|
||||||
|
cache_list[i],
|
||||||
|
attn_metadata,
|
||||||
|
output=out,
|
||||||
|
)
|
||||||
|
benchmark_fn = graph.replay
|
||||||
|
else:
|
||||||
|
|
||||||
|
def benchmark_fn():
|
||||||
|
for i in range(config.num_layers):
|
||||||
|
impl.forward(
|
||||||
|
layer,
|
||||||
|
q_list[i],
|
||||||
|
k_list[i],
|
||||||
|
v_list[i],
|
||||||
|
cache_list[i],
|
||||||
|
attn_metadata,
|
||||||
|
output=out,
|
||||||
|
)
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -400,27 +447,18 @@ def _run_single_benchmark(
|
|||||||
end = torch.cuda.Event(enable_timing=True)
|
end = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
start.record()
|
start.record()
|
||||||
for i in range(config.num_layers):
|
benchmark_fn()
|
||||||
impl.forward(
|
|
||||||
layer,
|
|
||||||
q_list[i],
|
|
||||||
k_list[i],
|
|
||||||
v_list[i],
|
|
||||||
cache_list[i],
|
|
||||||
attn_metadata,
|
|
||||||
output=out,
|
|
||||||
)
|
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||||
|
|
||||||
mem_stats = {}
|
mem_stats = {}
|
||||||
if config.profile_memory:
|
if config.profile_memory:
|
||||||
mem_stats = {
|
mem_stats = {
|
||||||
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
|
"allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
|
||||||
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
|
"reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
|
||||||
}
|
}
|
||||||
|
|
||||||
return times, mem_stats
|
return times, mem_stats
|
||||||
@@ -444,7 +482,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
|||||||
BenchmarkResult with timing and memory statistics
|
BenchmarkResult with timing and memory statistics
|
||||||
"""
|
"""
|
||||||
device = torch.device(config.device)
|
device = torch.device(config.device)
|
||||||
torch.cuda.set_device(device)
|
torch.accelerator.set_device_index(device)
|
||||||
|
|
||||||
backend_cfg = _get_backend_config(config.backend)
|
backend_cfg = _get_backend_config(config.backend)
|
||||||
|
|
||||||
@@ -503,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
|||||||
common_attn_metadata=common_metadata,
|
common_attn_metadata=common_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Only quantize queries when the impl supports it
|
||||||
|
quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
|
||||||
|
impl, "supports_quant_query_input", False
|
||||||
|
)
|
||||||
q_list, k_list, v_list = _create_input_tensors(
|
q_list, k_list, v_list = _create_input_tensors(
|
||||||
config, total_q, device, dtype
|
config, total_q, device, dtype, quantize_query=quantize_query
|
||||||
)
|
)
|
||||||
|
|
||||||
cache_list = _create_kv_cache(
|
cache_list = _create_kv_cache(
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
|
|||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||||
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
||||||
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
||||||
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
||||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
|
|||||||
@@ -94,15 +94,18 @@ def create_logits(
|
|||||||
|
|
||||||
def measure_memory() -> tuple[int, int]:
|
def measure_memory() -> tuple[int, int]:
|
||||||
"""Return (allocated, reserved) memory in bytes."""
|
"""Return (allocated, reserved) memory in bytes."""
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
return (
|
||||||
|
torch.accelerator.memory_allocated(),
|
||||||
|
torch.accelerator.max_memory_allocated(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def reset_memory_stats():
|
def reset_memory_stats():
|
||||||
"""Reset peak memory statistics."""
|
"""Reset peak memory statistics."""
|
||||||
reset_buffer_cache()
|
reset_buffer_cache()
|
||||||
torch.cuda.reset_peak_memory_stats()
|
torch.accelerator.reset_peak_memory_stats()
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
@@ -123,7 +126,7 @@ def benchmark_function(
|
|||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
logits_copy = logits.clone()
|
logits_copy = logits.clone()
|
||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Reset memory stats before benchmark
|
# Reset memory stats before benchmark
|
||||||
reset_memory_stats()
|
reset_memory_stats()
|
||||||
@@ -140,7 +143,7 @@ def benchmark_function(
|
|||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
end_events[i].record()
|
end_events[i].record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Calculate timing
|
# Calculate timing
|
||||||
times = [
|
times = [
|
||||||
|
|||||||
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
NUM_HEADS = 128
|
||||||
|
|
||||||
|
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs():
    """Return the token counts swept along the benchmark's x-axis."""
    token_counts = NUM_TOKENS
    return token_counts
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(num_tokens, dtype):
    """Create the query tensors fed to the concat benchmark.

    ``ql_nope`` is built by transposing a ``[NUM_HEADS, num_tokens, NOPE_DIM]``
    tensor, so it is a ``[num_tokens, NUM_HEADS, NOPE_DIM]`` view with the
    stride pattern of a transposed BMM output (``[N, B, L].transpose(0, 1)``)
    rather than a freshly allocated contiguous tensor. ``q_pe`` is allocated
    directly as ``[num_tokens, NUM_HEADS, ROPE_DIM]``.

    Args:
        num_tokens: Number of query tokens.
        dtype: Torch dtype used for both tensors.

    Returns:
        Tuple ``(ql_nope, q_pe)``, both created on the ``"cuda"`` device.
    """
    # Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
    raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
    ql_nope = raw.transpose(0, 1)

    q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
    return ql_nope, q_pe
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Non-contiguous nope benchmark (real code path) ----
|
||||||
|
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens"],
        x_vals=get_configs(),
        line_arg="provider",
        line_vals=["torch_cat", "concat_mla_q"],
        line_names=["torch.cat", "concat_mla_q (v8)"],
        styles=[("blue", "--"), ("green", "-")],
        ylabel="Latency (us)",
        plot_name="concat_mla_q-transposed",
        args={},
    )
)
def bench_transposed(num_tokens, provider):
    """Time one provider concatenating ``ql_nope`` and ``q_pe`` for ``num_tokens``.

    Args:
        num_tokens: Number of query tokens in the benchmarked inputs.
        provider: ``"torch_cat"`` times ``torch.cat``; any other value times
            ``ops.concat_mla_q`` writing into a preallocated output buffer.

    Returns:
        Tuple ``(median, p20, p80)`` latency in microseconds. The order matches
        the ``(y_mean, y_min, y_max)`` triple that ``perf_report`` unpacks, so
        the lower quantile is reported as the minimum and the upper quantile
        as the maximum.
    """
    dtype = torch.bfloat16
    ql_nope, q_pe = make_inputs(num_tokens, dtype)

    # Output buffer used only by the custom kernel; torch.cat allocates its own.
    q_out = torch.empty(
        num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
    )

    if provider == "torch_cat":

        def run():
            return torch.cat((ql_nope, q_pe), dim=-1)

    else:

        def run():
            return ops.concat_mla_q(ql_nope, q_pe, q_out)

    # Results come back in the same order as the requested quantiles.
    quantiles = [0.5, 0.2, 0.8]
    median_ms, p20_ms, p80_ms = triton.testing.do_bench_cudagraph(
        run, quantiles=quantiles, rep=500
    )

    # Convert ms -> us. For latency, the low quantile is the "min" bound.
    return median_ms * 1000, p20_ms * 1000, p80_ms * 1000
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the optional save path, print a banner that
    # describes the benchmarked dimensions, then run the perf report.
    cli = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
    cli.add_argument(
        "--save-path", type=str, default=None, help="Path to save benchmark results"
    )
    args = cli.parse_args()

    rule = "=" * 70
    per_head_elems = NOPE_DIM + ROPE_DIM

    print("\n" + rule)
    print("CONCAT MLA Q KERNEL BENCHMARKS")
    print(rule)
    print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
    print(f"Per-head output: {per_head_elems} bf16 = {per_head_elems * 2} bytes")
    print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
    print(rule)

    print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
    bench_transposed.run(print_data=True, save_path=args.save_path)

    print("\n" + rule)
    print("Benchmarking complete!")
    print(rule)
|
||||||
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 MLA dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
|
||||||
|
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
|
||||||
|
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
|
||||||
|
|
||||||
|
# Realistic prefill scenarios:
|
||||||
|
# - 1 long prefill: single request, 16K-96K tokens
|
||||||
|
# - 4 medium prefills: 4 requests, 4K-24K tokens each
|
||||||
|
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
|
||||||
|
SCENARIOS = [
|
||||||
|
# (label, num_reqs, total_tokens_list)
|
||||||
|
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(total_tokens, num_reqs, block_size):
    """Create a synthetic FP8 cache, block table, and output buffer on CUDA.

    The cache holds random bytes: only throughput is measured, not
    correctness. Each request is mapped to a contiguous run of physical
    blocks, and requests are laid out back to back.

    Args:
        total_tokens: Total number of tokens across all requests.
        num_reqs: Number of requests the tokens are split across.
        block_size: Number of tokens stored per physical cache block.

    Returns:
        Tuple ``(cache, dst, block_table, seq_lens_t, workspace_starts_t)``:
        - cache: uint8, shape (total_blocks, block_size, ENTRY_BYTES)
        - dst: bfloat16 zeros, shape (total_tokens, HEAD_DIM)
        - block_table: int32, shape (num_reqs, max_blocks), zero-padded
        - seq_lens_t: int32, per-request token counts
        - workspace_starts_t: int32, per-request start row in ``dst``
    """
    # Split tokens as evenly as possible; the first `extra` requests get one more.
    base_len, extra = divmod(total_tokens, num_reqs)
    seq_lens = [base_len + 1 if r < extra else base_len for r in range(num_reqs)]

    # Exclusive prefix sum of seq_lens: where each request starts in the workspace.
    workspace_starts = []
    offset = 0
    for length in seq_lens:
        workspace_starts.append(offset)
        offset += length

    # Physical blocks needed per request.
    blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
    total_blocks = sum(blocks_per_req)
    max_blocks = max(blocks_per_req)

    # Random cache contents (the values do not matter for timing).
    cache = torch.randint(
        0,
        256,
        (total_blocks, block_size, ENTRY_BYTES),
        dtype=torch.uint8,
        device="cuda",
    )

    # Build the block table on the host and upload it once. Writing each entry
    # into a CUDA tensor individually costs one tiny device op per block.
    table_rows = []
    next_block = 0
    for n_blocks in blocks_per_req:
        row = list(range(next_block, next_block + n_blocks))
        row.extend([0] * (max_blocks - n_blocks))  # pad unused slots with 0
        table_rows.append(row)
        next_block += n_blocks
    block_table = torch.tensor(table_rows, dtype=torch.int32, device="cuda")

    # Output workspace.
    dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")

    seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
    workspace_starts_t = torch.tensor(
        workspace_starts, dtype=torch.int32, device="cuda"
    )

    return cache, dst, block_table, seq_lens_t, workspace_starts_t
|
||||||
|
|
||||||
|
|
||||||
|
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
    """Benchmark the gather/upconvert kernel for one request-count scenario.

    Args:
        label: Short scenario name, used in the plot name and the header line.
        num_reqs: Number of requests the tokens are split across.
        total_tokens_list: Total-token values to sweep (the x axis).
        save_path: Directory passed to the Triton report runner for saving
            results, or None to skip saving.
    """

    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["total_tokens"],
            x_vals=total_tokens_list,
            line_arg="provider",
            line_vals=["cuda_kernel"],
            line_names=["cp_gather_fp8 (CUDA)"],
            styles=[("green", "-")],
            ylabel="Latency (us)",
            plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
            args={"num_reqs": num_reqs},
        )
    )
    def bench_fn(total_tokens, provider, num_reqs):
        cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
            total_tokens, num_reqs, BLOCK_SIZE
        )

        # Median plus the 20th / 80th percentiles as lower / upper bounds.
        quantiles = [0.5, 0.2, 0.8]

        median_ms, p20_ms, p80_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
                cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
            ),
            quantiles=quantiles,
            rep=500,
        )

        # Return order is (value, lower, upper). Latency is reported directly
        # (not inverted into a throughput), so the lower bound is the smaller
        # percentile. NOTE(review): relies on perf_report unpacking the result
        # as (y, y_min, y_max) - see the Triton testing docs.
        return median_ms * 1000, p20_ms * 1000, p80_ms * 1000  # ms -> us

    # Approximate per-request sequence lengths at both ends of the sweep.
    seq_len_per_req = total_tokens_list[0] // num_reqs
    seq_len_per_req_max = total_tokens_list[-1] // num_reqs
    print(
        f"\n--- {label}: {num_reqs} request(s), "
        f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
    )
    bench_fn.run(print_data=True, save_path=save_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line interface: the only option is where to save the results.
    parser = argparse.ArgumentParser(
        description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save benchmark results as CSV",
    )
    args = parser.parse_args()

    # Per-token data volume, printed to help with bandwidth analysis:
    # bytes read from the cache entry plus bytes written as BF16 output.
    bytes_out = HEAD_DIM * 2
    bytes_moved = ENTRY_BYTES + bytes_out

    rule = "=" * 70
    header = [
        "\n" + rule,
        "CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS",
        rule,
        f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)",
        f"Output row: {HEAD_DIM} BF16 = {bytes_out} bytes",
        f"Per token: {bytes_moved} bytes (read + write)",
        f"Block size: {BLOCK_SIZE} tokens/block",
        rule,
    ]
    for line in header:
        print(line)

    # Run every scenario in declaration order.
    for scenario in SCENARIOS:
        label, num_reqs, total_tokens_list = scenario
        bench_scenario(label, num_reqs, total_tokens_list, args.save_path)

    for line in ("\n" + rule, "Benchmarking complete!", rule):
        print(line)
|
||||||
@@ -168,7 +168,7 @@ def bench_impl(
|
|||||||
# warmup
|
# warmup
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
impl_type.get_impl()(**kwargs)
|
impl_type.get_impl()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
|
|||||||
# reference output
|
# reference output
|
||||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||||
|
|
||||||
# test ouptut
|
# test output
|
||||||
out_q, out_s = output_from_impl(
|
out_q, out_s = output_from_impl(
|
||||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ def bench_run(
|
|||||||
per_out_ch: bool,
|
per_out_ch: bool,
|
||||||
mkn: tuple[int, int, int],
|
mkn: tuple[int, int, int],
|
||||||
):
|
):
|
||||||
init_workspace_manager(torch.cuda.current_device())
|
init_workspace_manager(torch.accelerator.current_device_index())
|
||||||
(m, k, n) = mkn
|
(m, k, n) = mkn
|
||||||
|
|
||||||
dtype = torch.half
|
dtype = torch.half
|
||||||
@@ -171,7 +171,7 @@ def bench_run(
|
|||||||
activation=MoEActivation.SILU,
|
activation=MoEActivation.SILU,
|
||||||
global_num_experts=num_experts,
|
global_num_experts=num_experts,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
@@ -187,14 +187,14 @@ def bench_run(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing
|
# Timing
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
@@ -202,7 +202,7 @@ def bench_run(
|
|||||||
|
|
||||||
latencies = []
|
latencies = []
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -307,7 +307,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -330,7 +330,7 @@ def bench_run(
|
|||||||
e=num_experts,
|
e=num_experts,
|
||||||
device=device,
|
device=device,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -345,7 +345,7 @@ def bench_run(
|
|||||||
w2_fp8scale,
|
w2_fp8scale,
|
||||||
a_fp8_scale,
|
a_fp8_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
|
|||||||
if not should_use_fn(tensor):
|
if not should_use_fn(tensor):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
stream = torch.cuda.Stream()
|
stream = torch.cuda.Stream()
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(stream):
|
||||||
graph_input = tensor.clone()
|
graph_input = tensor.clone()
|
||||||
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
|
|||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_trials):
|
for _ in range(num_trials):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
@@ -495,7 +495,7 @@ def main():
|
|||||||
|
|
||||||
# Set device
|
# Set device
|
||||||
device = torch.device(f"cuda:{rank}")
|
device = torch.device(f"cuda:{rank}")
|
||||||
torch.cuda.set_device(device)
|
torch.accelerator.set_device_index(device)
|
||||||
|
|
||||||
# Get CPU process group
|
# Get CPU process group
|
||||||
cpu_group = dist.new_group(backend="gloo")
|
cpu_group = dist.new_group(backend="gloo")
|
||||||
|
|||||||
@@ -385,32 +385,32 @@ def benchmark_operation(
|
|||||||
# Warmup before graph capture
|
# Warmup before graph capture
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graph
|
# Create CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
num_op_per_cudagraph = 10
|
num_op_per_cudagraph = 10
|
||||||
|
|
||||||
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
|
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
|
||||||
device = torch.device(f"cuda:{torch.cuda.current_device()}")
|
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
|
||||||
with graph_capture(device=device), torch.cuda.graph(graph):
|
with graph_capture(device=device), torch.cuda.graph(graph):
|
||||||
for _ in range(num_op_per_cudagraph):
|
for _ in range(num_op_per_cudagraph):
|
||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
|
|
||||||
# Graph warmup
|
# Graph warmup
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
# Benchmark with CUDA graph
|
# Benchmark with CUDA graph
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(trials // num_op_per_cudagraph):
|
for _ in range(trials // num_op_per_cudagraph):
|
||||||
# operation_func(*args, **kwargs)
|
# operation_func(*args, **kwargs)
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
||||||
@@ -984,7 +984,7 @@ def main():
|
|||||||
world_size = int(os.environ["WORLD_SIZE"])
|
world_size = int(os.environ["WORLD_SIZE"])
|
||||||
|
|
||||||
device = torch.device(f"cuda:{rank}")
|
device = torch.device(f"cuda:{rank}")
|
||||||
torch.cuda.set_device(device)
|
torch.accelerator.set_device_index(device)
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
|
|
||||||
init_distributed_environment()
|
init_distributed_environment()
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ def bench_run(
|
|||||||
per_out_ch: bool,
|
per_out_ch: bool,
|
||||||
mkn: tuple[int, int, int],
|
mkn: tuple[int, int, int],
|
||||||
):
|
):
|
||||||
init_workspace_manager(torch.cuda.current_device())
|
init_workspace_manager(torch.accelerator.current_device_index())
|
||||||
label = "Quant Matmul"
|
label = "Quant Matmul"
|
||||||
|
|
||||||
sub_label = (
|
sub_label = (
|
||||||
@@ -224,7 +224,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -239,7 +239,7 @@ def bench_run(
|
|||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -254,7 +254,7 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
a_scale,
|
a_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -34,14 +34,14 @@ def main(
|
|||||||
residual = torch.randn_like(x) * scale if add_residual else None
|
residual = torch.randn_like(x) * scale if add_residual else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
layer(x, residual)
|
layer(x, residual)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -1035,7 +1035,7 @@ def bench_optype(
|
|||||||
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
op_type.bench_fn()(**kwargs)
|
op_type.bench_fn()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
|
|||||||
@@ -47,13 +47,13 @@ def benchmark_method(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
|
|
||||||
return (end - start) / num_iters * 1000 # Convert to ms
|
return (end - start) / num_iters * 1000 # Convert to ms
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ def clear_triton_cache():
|
|||||||
|
|
||||||
# Clear CUDA memory cache
|
# Clear CUDA memory cache
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
# Try to clear Triton's runtime cache
|
# Try to clear Triton's runtime cache
|
||||||
try:
|
try:
|
||||||
@@ -304,19 +304,19 @@ def benchmark_config(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -324,7 +324,7 @@ def benchmark_config(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
@@ -626,7 +626,11 @@ class BenchmarkWorker:
|
|||||||
if visible_device != f"{self.device_id}":
|
if visible_device != f"{self.device_id}":
|
||||||
need_device_guard = True
|
need_device_guard = True
|
||||||
|
|
||||||
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
|
with (
|
||||||
|
torch.accelerator.device_index(self.device_id)
|
||||||
|
if need_device_guard
|
||||||
|
else nullcontext()
|
||||||
|
):
|
||||||
for idx, config in enumerate(tqdm(search_space)):
|
for idx, config in enumerate(tqdm(search_space)):
|
||||||
try:
|
try:
|
||||||
kernel_time = benchmark_config(
|
kernel_time = benchmark_config(
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ def benchmark_config(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = torch.cuda.Event(enable_timing=True)
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
@@ -149,7 +149,7 @@ def benchmark_config(
|
|||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -69,19 +69,19 @@ def benchmark_permute(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -89,7 +89,7 @@ def benchmark_permute(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
@@ -159,26 +159,26 @@ def benchmark_unpermute(
|
|||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
input = prepare()
|
input = prepare()
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -135,14 +135,14 @@ def benchmark_mrope(
|
|||||||
key.clone(),
|
key.clone(),
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Time reference implementation
|
# Time reference implementation
|
||||||
torch_times = []
|
torch_times = []
|
||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
mrope_helper_class.forward_native(
|
mrope_helper_class.forward_native(
|
||||||
@@ -151,7 +151,7 @@ def benchmark_mrope(
|
|||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
torch_times.append(time.time() - start_time)
|
torch_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Time triton kernel implementation
|
# Time triton kernel implementation
|
||||||
@@ -159,14 +159,14 @@ def benchmark_mrope(
|
|||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
mrope_helper_class.forward_cuda(
|
mrope_helper_class.forward_cuda(
|
||||||
positions,
|
positions,
|
||||||
query_clone,
|
query_clone,
|
||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
triton_times.append(time.time() - start_time)
|
triton_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Calculate statistics
|
# Calculate statistics
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ def main(
|
|||||||
max_logits = torch.empty_like(exp_sums)
|
max_logits = torch.empty_like(exp_sums)
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -173,7 +173,7 @@ def main(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid version: {version}")
|
raise ValueError(f"Invalid version: {version}")
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def _time_cuda(
|
|||||||
# warmup
|
# warmup
|
||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
fn()
|
fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
@@ -37,7 +37,7 @@ def _time_cuda(
|
|||||||
for _ in range(bench_iters):
|
for _ in range(bench_iters):
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
return start.elapsed_time(end) / bench_iters # ms/iter
|
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def main(
|
|||||||
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -39,7 +39,7 @@ def main(
|
|||||||
ops.scaled_int8_quant(x, scale)
|
ops.scaled_int8_quant(x, scale)
|
||||||
else:
|
else:
|
||||||
ops.scaled_fp8_quant(x, scale)
|
ops.scaled_fp8_quant(x, scale)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -84,16 +84,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -104,7 +104,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -109,16 +109,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -251,7 +251,7 @@ def benchmark(
|
|||||||
kernel(
|
kernel(
|
||||||
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -259,7 +259,7 @@ def benchmark(
|
|||||||
# Benchmark
|
# Benchmark
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for _ in range(runs):
|
for _ in range(runs):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
for i in range(iterations_per_run):
|
for i in range(iterations_per_run):
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ def benchmark_decode(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -136,7 +136,7 @@ def benchmark_decode(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ def benchmark_prefill(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -148,7 +148,7 @@ def benchmark_prefill(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -177,18 +177,18 @@ def benchmark_config(
|
|||||||
def run():
|
def run():
|
||||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
# JIT complication & warmup
|
# JIT complication & warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
run()
|
run()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
|
|||||||
weight_shapes = args_dict["weight_shapes"]
|
weight_shapes = args_dict["weight_shapes"]
|
||||||
args = args_dict["args"]
|
args = args_dict["args"]
|
||||||
|
|
||||||
torch.cuda.set_device(gpu_id)
|
torch.accelerator.set_device_index(gpu_id)
|
||||||
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
||||||
|
|
||||||
block_n = args.block_n
|
block_n = args.block_n
|
||||||
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
|
|||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
print(args)
|
print(args)
|
||||||
num_gpus = torch.cuda.device_count()
|
num_gpus = torch.accelerator.device_count()
|
||||||
if num_gpus == 0:
|
if num_gpus == 0:
|
||||||
raise RuntimeError("No GPU available for tuning")
|
raise RuntimeError("No GPU available for tuning")
|
||||||
print(f"Found {num_gpus} GPUs for parallel tuning")
|
print(f"Found {num_gpus} GPUs for parallel tuning")
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def benchmark_shape(
|
|||||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Reference result in BF16
|
# Reference result in BF16
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
C_ref = A @ B.t()
|
C_ref = A @ B.t()
|
||||||
|
|
||||||
# Pre-quantize B for all implementations
|
# Pre-quantize B for all implementations
|
||||||
@@ -121,14 +121,14 @@ def benchmark_shape(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing loop
|
# Timing loop
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.time()
|
start = time.time()
|
||||||
for _ in range(repeat):
|
for _ in range(repeat):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|
||||||
# Calculate timing and TFLOPS
|
# Calculate timing and TFLOPS
|
||||||
|
|||||||
@@ -79,7 +79,8 @@ else()
|
|||||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
|
||||||
|
find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
|
||||||
|
|
||||||
# Support cross-compilation by allowing override via environment variables
|
# Support cross-compilation by allowing override via environment variables
|
||||||
if (ENABLE_ARM_BF16)
|
if (ENABLE_ARM_BF16)
|
||||||
@@ -101,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
|
|||||||
"-mavx512f"
|
"-mavx512f"
|
||||||
"-mavx512vl"
|
"-mavx512vl"
|
||||||
"-mavx512bw"
|
"-mavx512bw"
|
||||||
"-mavx512dq"
|
"-mavx512dq")
|
||||||
"-mavx512bf16"
|
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
|
||||||
"-mavx512vnni"
|
${CXX_COMPILE_FLAGS_AVX512}
|
||||||
"-mamx-bf16"
|
"-mamx-bf16"
|
||||||
"-mamx-tile")
|
"-mamx-tile"
|
||||||
|
"-mavx512bf16"
|
||||||
|
"-mavx512vnni")
|
||||||
list(APPEND CXX_COMPILE_FLAGS_AVX2
|
list(APPEND CXX_COMPILE_FLAGS_AVX2
|
||||||
"-mavx2")
|
"-mavx2")
|
||||||
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||||
@@ -142,11 +145,19 @@ elseif (S390_FOUND)
|
|||||||
"-march=native"
|
"-march=native"
|
||||||
"-mtune=native")
|
"-mtune=native")
|
||||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||||
if(RVV_FOUND)
|
message(STATUS "RISC-V detected")
|
||||||
message(FAIL_ERROR "Can't support rvv now.")
|
if(RVV_BF16_FOUND)
|
||||||
|
message(STATUS "BF16 extension detected")
|
||||||
|
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
|
||||||
|
add_compile_definitions(RISCV_BF16_SUPPORT)
|
||||||
|
elseif (RVV_FP16_FOUND)
|
||||||
|
message(WARNING "BF16 functionality is not available")
|
||||||
|
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
|
||||||
else()
|
else()
|
||||||
|
message(STATUS "compile riscv with scalar")
|
||||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
||||||
endif()
|
endif()
|
||||||
|
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
|
||||||
else()
|
else()
|
||||||
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
||||||
endif()
|
endif()
|
||||||
@@ -242,13 +253,24 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
|
|||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
message(STATUS "Downloading oneDNN from GitHub")
|
message(STATUS "Downloading oneDNN from GitHub")
|
||||||
FetchContent_Declare(
|
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||||
oneDNN
|
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
|
||||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
FetchContent_Declare(
|
||||||
GIT_TAG v3.10
|
oneDNN
|
||||||
GIT_PROGRESS TRUE
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
GIT_SHALLOW TRUE
|
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
|
||||||
)
|
GIT_PROGRESS TRUE
|
||||||
|
GIT_SHALLOW FALSE
|
||||||
|
)
|
||||||
|
else()
|
||||||
|
FetchContent_Declare(
|
||||||
|
oneDNN
|
||||||
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
|
GIT_TAG v3.10
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
GIT_SHALLOW TRUE
|
||||||
|
)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
@@ -294,7 +316,8 @@ endif()
|
|||||||
|
|
||||||
# TODO: Refactor this
|
# TODO: Refactor this
|
||||||
if (ENABLE_X86_ISA)
|
if (ENABLE_X86_ISA)
|
||||||
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
|
||||||
|
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||||
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
|
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||||
@@ -346,13 +369,15 @@ if(USE_ONEDNN)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (ENABLE_X86_ISA)
|
if (ENABLE_X86_ISA)
|
||||||
set(VLLM_EXT_SRC_AVX512
|
set(VLLM_EXT_SRC_SGL
|
||||||
"csrc/cpu/sgl-kernels/gemm.cpp"
|
"csrc/cpu/sgl-kernels/gemm.cpp"
|
||||||
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
||||||
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
||||||
"csrc/cpu/sgl-kernels/moe.cpp"
|
"csrc/cpu/sgl-kernels/moe.cpp"
|
||||||
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
||||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
|
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
|
||||||
|
|
||||||
|
set(VLLM_EXT_SRC_AVX512
|
||||||
"csrc/cpu/shm.cpp"
|
"csrc/cpu/shm.cpp"
|
||||||
"csrc/cpu/cpu_wna16.cpp"
|
"csrc/cpu/cpu_wna16.cpp"
|
||||||
"csrc/cpu/cpu_fused_moe.cpp"
|
"csrc/cpu/cpu_fused_moe.cpp"
|
||||||
@@ -378,31 +403,48 @@ if (ENABLE_X86_ISA)
|
|||||||
"csrc/cpu/pos_encoding.cpp"
|
"csrc/cpu/pos_encoding.cpp"
|
||||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
||||||
|
|
||||||
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
|
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
|
||||||
|
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||||
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
|
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
|
||||||
|
|
||||||
|
set(_C_LIBS numa dnnl_ext)
|
||||||
|
set(_C_AVX512_LIBS numa dnnl_ext)
|
||||||
|
set(_C_AVX2_LIBS numa)
|
||||||
|
|
||||||
|
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
|
||||||
define_extension_target(
|
define_extension_target(
|
||||||
_C
|
_C
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
LANGUAGE CXX
|
LANGUAGE CXX
|
||||||
|
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
|
||||||
|
LIBRARIES ${_C_LIBS}
|
||||||
|
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
|
||||||
|
USE_SABI 3
|
||||||
|
WITH_SOABI
|
||||||
|
)
|
||||||
|
|
||||||
|
# For AMX kernels
|
||||||
|
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||||
|
|
||||||
|
# AVX512F
|
||||||
|
define_extension_target(
|
||||||
|
_C_AVX512
|
||||||
|
DESTINATION vllm
|
||||||
|
LANGUAGE CXX
|
||||||
SOURCES ${VLLM_EXT_SRC_AVX512}
|
SOURCES ${VLLM_EXT_SRC_AVX512}
|
||||||
LIBRARIES ${LIBS}
|
LIBRARIES ${_C_AVX512_LIBS}
|
||||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
|
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI
|
WITH_SOABI
|
||||||
)
|
)
|
||||||
|
|
||||||
# For SGL kernels
|
# AVX2
|
||||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
|
|
||||||
# For AMX kernels
|
|
||||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
|
||||||
|
|
||||||
define_extension_target(
|
define_extension_target(
|
||||||
_C_AVX2
|
_C_AVX2
|
||||||
DESTINATION vllm
|
DESTINATION vllm
|
||||||
LANGUAGE CXX
|
LANGUAGE CXX
|
||||||
SOURCES ${VLLM_EXT_SRC_AVX2}
|
SOURCES ${VLLM_EXT_SRC_AVX2}
|
||||||
LIBRARIES ${LIBS}
|
LIBRARIES ${_C_AVX2_LIBS}
|
||||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
|
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
|
||||||
USE_SABI 3
|
USE_SABI 3
|
||||||
WITH_SOABI
|
WITH_SOABI
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ else()
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
vllm-flash-attn
|
vllm-flash-attn
|
||||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
|
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
# Don't share the vllm-flash-attn build between build types
|
# Don't share the vllm-flash-attn build between build types
|
||||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
|||||||
@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
|
|||||||
int64_t quant_block_size, // quantization block size
|
int64_t quant_block_size, // quantization block size
|
||||||
const std::string& scale_fmt);
|
const std::string& scale_fmt);
|
||||||
|
|
||||||
|
// Concatenate query nope and rope for MLA/DSA attention
|
||||||
|
void concat_mla_q(
|
||||||
|
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
|
||||||
|
|
||||||
// Extract function to gather quantized K cache
|
// Extract function to gather quantized K cache
|
||||||
void cp_gather_indexer_k_quant_cache(
|
void cp_gather_indexer_k_quant_cache(
|
||||||
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
#include "cuda_compat.h"
|
#include "cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
#include "quantization/vectorization_utils.cuh"
|
||||||
|
#include "concat_mla_q.cuh"
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
||||||
@@ -918,8 +919,8 @@ __global__ void gather_and_maybe_dequant_cache(
|
|||||||
// SCALAR_T is the data type of the destination tensor.
|
// SCALAR_T is the data type of the destination tensor.
|
||||||
// CACHE_T is the stored data type of kv-cache.
|
// CACHE_T is the stored data type of kv-cache.
|
||||||
// KV_DTYPE is the real data type of kv-cache.
|
// KV_DTYPE is the real data type of kv-cache.
|
||||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ) \
|
||||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
|
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
|
||||||
thread_block_size> \
|
thread_block_size> \
|
||||||
<<<grid, block, 0, stream>>>( \
|
<<<grid, block, 0, stream>>>( \
|
||||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||||
@@ -930,6 +931,12 @@ __global__ void gather_and_maybe_dequant_cache(
|
|||||||
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
||||||
seq_starts_ptr);
|
seq_starts_ptr);
|
||||||
|
|
||||||
|
#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||||
|
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
|
||||||
|
|
||||||
|
#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||||
|
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
|
||||||
|
|
||||||
// Gather sequences from the cache into the destination tensor.
|
// Gather sequences from the cache into the destination tensor.
|
||||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||||
// - block_table contains the cache block indices for each sequence
|
// - block_table contains the cache block indices for each sequence
|
||||||
@@ -959,9 +966,10 @@ void gather_and_maybe_dequant_cache(
|
|||||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||||
"seq_starts must be int32");
|
"seq_starts must be int32");
|
||||||
}
|
}
|
||||||
TORCH_CHECK(head_dim == 576,
|
TORCH_CHECK(
|
||||||
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
|
head_dim == 320 || head_dim == 576,
|
||||||
"for better performance")
|
"gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
|
||||||
|
"for better performance")
|
||||||
|
|
||||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||||
"src_cache and dst must be on the same device");
|
"src_cache and dst must be on the same device");
|
||||||
@@ -986,7 +994,13 @@ void gather_and_maybe_dequant_cache(
|
|||||||
const int32_t* seq_starts_ptr =
|
const int32_t* seq_starts_ptr =
|
||||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||||
|
|
||||||
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
|
if (head_dim == 576) {
|
||||||
|
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
|
||||||
|
CALL_GATHER_CACHE_576);
|
||||||
|
} else {
|
||||||
|
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
|
||||||
|
CALL_GATHER_CACHE_320);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@@ -995,75 +1009,67 @@ namespace vllm {
|
|||||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
|
||||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
|
||||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
const int32_t* __restrict__ workspace_starts, // [num_reqs]
|
||||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
const int32_t num_reqs, const int32_t block_size,
|
||||||
const int32_t block_size, const int32_t head_dim,
|
const int32_t total_tokens, const int64_t block_table_stride,
|
||||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
const int64_t cache_block_stride, const int64_t cache_entry_stride,
|
||||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
const int64_t dst_entry_stride) {
|
||||||
const int64_t bid = blockIdx.x; // Batch ID
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
const int32_t num_splits = gridDim.y;
|
if (flat_warp_id >= total_tokens) return;
|
||||||
const int32_t split = blockIdx.y;
|
const int lane_id = threadIdx.x & 31;
|
||||||
const int32_t seq_start = workspace_starts[bid];
|
|
||||||
const int32_t seq_len = seq_lens[bid];
|
|
||||||
const int32_t tot_slots = seq_len;
|
|
||||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
|
||||||
|
|
||||||
const int32_t split_start = split * split_slots;
|
// Binary search to find which request owns this output token
|
||||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
int lo = 0, hi = num_reqs - 1;
|
||||||
|
while (lo < hi) {
|
||||||
const bool is_active_split = (split_start < tot_slots);
|
int mid = (lo + hi + 1) >> 1;
|
||||||
|
if (workspace_starts[mid] <= flat_warp_id)
|
||||||
if (!is_active_split) return;
|
lo = mid;
|
||||||
|
else
|
||||||
// Adjust the pointer for the block_table for this batch
|
hi = mid - 1;
|
||||||
const int32_t batch_offset = bid * block_table_stride;
|
|
||||||
int32_t offset = split_start;
|
|
||||||
int32_t offset_div = offset / block_size;
|
|
||||||
offset = offset % block_size;
|
|
||||||
const int32_t* batch_block_table = block_table + batch_offset;
|
|
||||||
|
|
||||||
// Adjust dst pointer based on the cumulative sequence lengths
|
|
||||||
dst += seq_start * dst_entry_stride;
|
|
||||||
|
|
||||||
const int tid = threadIdx.x;
|
|
||||||
|
|
||||||
// Process each token in this split
|
|
||||||
for (int pid = split_start; pid < split_end; ++pid) {
|
|
||||||
auto block_id = batch_block_table[offset_div];
|
|
||||||
const uint8_t* token_ptr =
|
|
||||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
|
||||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
|
||||||
|
|
||||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
|
||||||
const uint8_t* no_pe_ptr = token_ptr;
|
|
||||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
|
||||||
const __nv_bfloat16* rope_ptr =
|
|
||||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
|
||||||
|
|
||||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
|
||||||
if (tid < 512) {
|
|
||||||
// FP8 dequantization
|
|
||||||
const int tile = tid >> 7; // each tile is 128 elements
|
|
||||||
const float scale = scales_ptr[tile];
|
|
||||||
const uint8_t val = no_pe_ptr[tid];
|
|
||||||
dst_ptr[tid] =
|
|
||||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
|
||||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
|
||||||
} else if (tid < 576) {
|
|
||||||
// Rope copy (64 bf16 elements)
|
|
||||||
const int rope_idx = tid - 512;
|
|
||||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Move to next token
|
|
||||||
offset += 1;
|
|
||||||
if (offset == block_size) {
|
|
||||||
offset_div += 1;
|
|
||||||
offset = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
const int req_id = lo;
|
||||||
|
|
||||||
|
// Compute physical token address via block table
|
||||||
|
const int out_token_id = flat_warp_id;
|
||||||
|
const int token_offset = out_token_id - workspace_starts[req_id];
|
||||||
|
const int cache_block_idx = token_offset / block_size;
|
||||||
|
const int offset_in_block = token_offset % block_size;
|
||||||
|
const int physical_block =
|
||||||
|
block_table[req_id * block_table_stride + cache_block_idx];
|
||||||
|
|
||||||
|
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
|
||||||
|
offset_in_block * cache_entry_stride;
|
||||||
|
|
||||||
|
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
|
||||||
|
const int4 fp8_data = nope_src[lane_id];
|
||||||
|
|
||||||
|
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||||
|
const float scale = scales_ptr[lane_id >> 3];
|
||||||
|
|
||||||
|
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
|
||||||
|
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
|
||||||
|
#ifdef USE_ROCM
|
||||||
|
const bf16_8_t bf16_lo =
|
||||||
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
|
||||||
|
const bf16_8_t bf16_hi =
|
||||||
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
|
||||||
|
#else
|
||||||
|
const bf16_8_t bf16_lo =
|
||||||
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
|
||||||
|
const bf16_8_t bf16_hi =
|
||||||
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
|
||||||
|
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
|
||||||
|
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
|
||||||
|
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
|
||||||
|
|
||||||
|
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
|
||||||
|
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
|
||||||
|
rope_dst[lane_id] = rope_src[lane_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
@@ -1257,15 +1263,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
|
|||||||
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decide on the number of splits based on the batch size
|
const int total_tokens = dst.size(0);
|
||||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
constexpr int warps_per_block = 8;
|
||||||
dim3 grid(batch_size, num_splits);
|
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
|
||||||
dim3 block(576);
|
const int block_size_threads = warps_per_block * 32; // 256 threads
|
||||||
|
|
||||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
|
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
|
||||||
|
stream>>>(
|
||||||
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
||||||
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
|
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
|
||||||
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
|
static_cast<int32_t>(batch_size), block_size, total_tokens,
|
||||||
block_table_stride, cache_block_stride, cache_entry_stride,
|
block_table_stride, cache_block_stride, cache_entry_stride,
|
||||||
dst_entry_stride);
|
dst_entry_stride);
|
||||||
}
|
}
|
||||||
@@ -1365,3 +1372,43 @@ void cp_gather_indexer_k_quant_cache(
|
|||||||
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
|
||||||
|
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
|
||||||
|
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
|
||||||
|
// rope_dim]
|
||||||
|
) {
|
||||||
|
const int num_tokens = ql_nope.size(0);
|
||||||
|
const int num_heads = ql_nope.size(1);
|
||||||
|
const int nope_dim = ql_nope.size(2);
|
||||||
|
const int rope_dim = q_pe.size(2);
|
||||||
|
|
||||||
|
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
|
||||||
|
nope_dim);
|
||||||
|
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
|
||||||
|
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
|
||||||
|
|
||||||
|
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
|
||||||
|
|
||||||
|
if (num_tokens == 0) return;
|
||||||
|
|
||||||
|
constexpr int warps_per_block = 8;
|
||||||
|
const int total_warps = num_tokens * num_heads;
|
||||||
|
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
|
||||||
|
const int block_size = warps_per_block * 32;
|
||||||
|
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
|
||||||
|
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
|
||||||
|
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
|
||||||
|
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
|
||||||
|
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
|
||||||
|
q_pe.stride(1));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
60
csrc/concat_mla_q.cuh
Normal file
60
csrc/concat_mla_q.cuh
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#ifndef CONCAT_MLA_Q_CUH_
|
||||||
|
#define CONCAT_MLA_Q_CUH_
|
||||||
|
|
||||||
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#include "cuda_vec_utils.cuh"
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
|
||||||
|
// q_pe [num_tokens, num_heads, 64]
|
||||||
|
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
|
||||||
|
// Currently instantiated only for NOPE_DIM=512.
|
||||||
|
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
|
||||||
|
template <typename DType, int NOPE_DIM>
|
||||||
|
__global__ void ConcatMLAQKernel(
|
||||||
|
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
|
||||||
|
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
|
||||||
|
const int64_t out_stride_0, const int64_t out_stride_1,
|
||||||
|
const int64_t nope_stride_0, const int64_t nope_stride_1,
|
||||||
|
const int64_t pe_stride_0, const int64_t pe_stride_1) {
|
||||||
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
|
if (flat_warp_id >= num_tokens * num_heads) return;
|
||||||
|
|
||||||
|
const int token_id = flat_warp_id / num_heads;
|
||||||
|
const int head_id = flat_warp_id % num_heads;
|
||||||
|
const int lane_id = threadIdx.x & 31;
|
||||||
|
|
||||||
|
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
|
||||||
|
constexpr int nope_vec_loads =
|
||||||
|
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
|
||||||
|
|
||||||
|
const DType* nope_src =
|
||||||
|
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
|
||||||
|
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < nope_vec_loads; i++) {
|
||||||
|
const int offset = i * 32 + lane_id;
|
||||||
|
if constexpr (use_256b) {
|
||||||
|
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
|
||||||
|
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
|
||||||
|
} else {
|
||||||
|
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
|
||||||
|
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int* rope_src = reinterpret_cast<const int*>(
|
||||||
|
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
|
||||||
|
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
|
||||||
|
head_id * out_stride_1 + NOPE_DIM);
|
||||||
|
|
||||||
|
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
#endif // CONCAT_MLA_Q_CUH_
|
||||||
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
|||||||
const int64_t block_size, const int64_t block_size_stride) {
|
const int64_t block_size, const int64_t block_size_stride) {
|
||||||
// For AMX 2D tiles, size of each line is 64 bytes
|
// For AMX 2D tiles, size of each line is 64 bytes
|
||||||
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
||||||
// For AMX B martix, N always is 16
|
// For AMX B matrix, N always is 16
|
||||||
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
||||||
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
||||||
// For now suppose block_size is divisible by amx_tile_column_num
|
// For now suppose block_size is divisible by amx_tile_column_num
|
||||||
|
|||||||
@@ -13,6 +13,9 @@
|
|||||||
#elif defined(__aarch64__)
|
#elif defined(__aarch64__)
|
||||||
// arm implementation
|
// arm implementation
|
||||||
#include "cpu_types_arm.hpp"
|
#include "cpu_types_arm.hpp"
|
||||||
|
#elif defined(__riscv_v)
|
||||||
|
// riscv implementation
|
||||||
|
#include "cpu_types_riscv.hpp"
|
||||||
#else
|
#else
|
||||||
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
|
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
|
||||||
#include "cpu_types_scalar.hpp"
|
#include "cpu_types_scalar.hpp"
|
||||||
|
|||||||
832
csrc/cpu/cpu_types_riscv.hpp
Normal file
832
csrc/cpu/cpu_types_riscv.hpp
Normal file
@@ -0,0 +1,832 @@
|
|||||||
|
#ifndef CPU_TYPES_RISCV_HPP
|
||||||
|
#define CPU_TYPES_RISCV_HPP
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
|
#include <riscv_vector.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Vector Register Type Definitions (VLEN=128 bits)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
typedef vfloat16m1_t fixed_vfloat16m1_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(128)));
|
||||||
|
typedef vfloat16m2_t fixed_vfloat16m2_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(256)));
|
||||||
|
|
||||||
|
typedef vfloat32m1_t fixed_vfloat32m1_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(128)));
|
||||||
|
typedef vfloat32m2_t fixed_vfloat32m2_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(256)));
|
||||||
|
typedef vfloat32m4_t fixed_vfloat32m4_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(512)));
|
||||||
|
typedef vfloat32m8_t fixed_vfloat32m8_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(1024)));
|
||||||
|
|
||||||
|
typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
|
||||||
|
typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
|
||||||
|
|
||||||
|
typedef vuint16m1_t fixed_vuint16m1_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(128)));
|
||||||
|
typedef vuint16m2_t fixed_vuint16m2_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(256)));
|
||||||
|
typedef vuint16m4_t fixed_vuint16m4_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(512)));
|
||||||
|
|
||||||
|
#ifdef RISCV_BF16_SUPPORT
|
||||||
|
typedef vbfloat16m1_t fixed_vbfloat16m1_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(128)));
|
||||||
|
typedef vbfloat16m2_t fixed_vbfloat16m2_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(256)));
|
||||||
|
typedef vbfloat16m4_t fixed_vbfloat16m4_t
|
||||||
|
__attribute__((riscv_rvv_vector_bits(512)));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
#ifdef RISCV_BF16_SUPPORT
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||||
|
#else
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
|
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
namespace {
// Calls `body` once for every index in the pack, in increasing order. Each
// index is passed as a std::integral_constant, so it is usable as a
// compile-time constant inside `body`.
template <typename IndexT, IndexT... kIndices, typename Body>
constexpr void unroll_loop_item(std::integer_sequence<IndexT, kIndices...>,
                                Body&& body) {
  (body(std::integral_constant<IndexT, kIndices>{}), ...);
}
}  // namespace

// Compile-time unrolled loop: invokes `f` with the indices 0 .. count-1.
// Only participates in overload resolution when `F` is callable with a `T`.
template <typename IndexT, IndexT kCount, typename Body,
          typename = std::enable_if_t<std::is_invocable_v<Body, IndexT>>>
constexpr void unroll_loop(Body&& f) {
  using Indices = std::make_integer_sequence<IndexT, kCount>;
  unroll_loop_item(Indices{}, std::forward<Body>(f));
}
|
||||||
|
|
||||||
|
// CRTP base shared by all vector wrappers: exposes the lane count that the
// derived type publishes as `VEC_ELEM_NUM`.
template <typename Derived>
struct Vec {
  static constexpr int get_elem_num() { return Derived::VEC_ELEM_NUM; }
};
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// FP16 Implementation
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Eight FP16 lanes held in one fixed-length f16m1 register.
struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat16m1_t reg;

  // Loads VEC_ELEM_NUM consecutive half-precision values starting at `ptr`.
  explicit FP16Vec8(const void* ptr)
      : reg(__riscv_vle16_v_f16m1(static_cast<const _Float16*>(ptr),
                                  VEC_ELEM_NUM)) {}

  // Narrowing conversion from FP32; defined out of line.
  explicit FP16Vec8(const FP32Vec8&);

  // Stores all lanes to `ptr`.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vse16_v_f16m1(dst, reg, elem_num);
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    // The strided store takes its stride in bytes, not in elements.
    const ptrdiff_t stride_in_bytes = stride * sizeof(_Float16);
    __riscv_vsse16_v_f16m1(dst, stride_in_bytes, reg, VEC_ELEM_NUM);
  }
};
|
||||||
|
|
||||||
|
// Sixteen FP16 lanes held in one fixed-length f16m2 register group.
struct FP16Vec16 : public Vec<FP16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vfloat16m2_t reg;

  // Loads VEC_ELEM_NUM consecutive half-precision values starting at `ptr`.
  explicit FP16Vec16(const void* ptr)
      : reg(__riscv_vle16_v_f16m2(static_cast<const _Float16*>(ptr),
                                  VEC_ELEM_NUM)) {}

  // Narrowing conversion from FP32; defined out of line.
  explicit FP16Vec16(const FP32Vec16& vec);

  // Stores all lanes to `ptr`.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vse16_v_f16m2(dst, reg, elem_num);
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    // The strided store takes its stride in bytes, not in elements.
    const ptrdiff_t stride_in_bytes = stride * sizeof(_Float16);
    __riscv_vsse16_v_f16m2(dst, stride_in_bytes, reg, VEC_ELEM_NUM);
  }
};
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// BF16 Implementation
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#ifdef RISCV_BF16_SUPPORT
|
||||||
|
|
||||||
|
// Bit-level reinterpretation of a BF16 vector as 16-bit unsigned integers,
// one overload per register grouping (m1 / m2 / m4). No value conversion is
// performed; the stores below use these to write raw BF16 bit patterns.
FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t bf16_bits) {
  return __riscv_vreinterpret_v_bf16m1_u16m1(bf16_bits);
}

FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t bf16_bits) {
  return __riscv_vreinterpret_v_bf16m2_u16m2(bf16_bits);
}

FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t bf16_bits) {
  return __riscv_vreinterpret_v_bf16m4_u16m4(bf16_bits);
}
|
||||||
|
|
||||||
|
// Eight BF16 lanes held in one fixed-length bf16m1 register (native BF16
// build). Memory traffic goes through u16 loads/stores plus reinterpretation.
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vbfloat16m1_t reg;

  // Loads VEC_ELEM_NUM consecutive 16-bit patterns starting at `ptr`.
  explicit BF16Vec8(const void* ptr)
      : reg(__riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vle16_v_u16m1(
            reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {}

  // Wraps an already loaded register.
  explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {}

  // Narrowing conversion from FP32; defined out of line.
  explicit BF16Vec8(const FP32Vec8&);

  // Stores all lanes to `ptr`.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m1(dst, bf16_to_u16(reg), elem_num);
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    // The strided store takes its stride in bytes, not in elements.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    __riscv_vsse16_v_u16m1(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 lanes held in one fixed-length bf16m2 register group (native
// BF16 build). Memory traffic goes through u16 loads/stores plus
// reinterpretation.
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vbfloat16m2_t reg;

  // Loads VEC_ELEM_NUM consecutive 16-bit patterns starting at `ptr`.
  explicit BF16Vec16(const void* ptr)
      : reg(__riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vle16_v_u16m2(
            reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {}

  // Wraps an already loaded register group.
  explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {}

  // Narrowing conversion from FP32; defined out of line.
  explicit BF16Vec16(const FP32Vec16&);

  // Stores all lanes to `ptr`.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m2(dst, bf16_to_u16(reg), elem_num);
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    // The strided store takes its stride in bytes, not in elements.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    __riscv_vsse16_v_u16m2(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
};
|
||||||
|
|
||||||
|
// Thirty-two BF16 lanes held in one fixed-length bf16m4 register group
// (native BF16 build).
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  fixed_vbfloat16m4_t reg;

  // Loads VEC_ELEM_NUM consecutive 16-bit patterns starting at `ptr`.
  explicit BF16Vec32(const void* ptr)
      : reg(__riscv_vreinterpret_v_u16m4_bf16m4(__riscv_vle16_v_u16m4(
            reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {}

  // Wraps an already loaded register group.
  explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {}

  // Broadcasts an 8-lane vector: the same m1 register fills all four slots
  // of the m4 group, so the 8 source lanes repeat four times.
  explicit BF16Vec32(const BF16Vec8& v) {
    const fixed_vuint16m1_t quarter = bf16_to_u16(v.reg);
    const fixed_vuint16m4_t tiled =
        __riscv_vcreate_v_u16m1_u16m4(quarter, quarter, quarter, quarter);
    reg = __riscv_vreinterpret_v_u16m4_bf16m4(tiled);
  }

  // Stores all lanes to `ptr`.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m4(dst, bf16_to_u16(reg), elem_num);
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    // The strided store takes its stride in bytes, not in elements.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    __riscv_vsse16_v_u16m4(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
};
|
||||||
|
|
||||||
|
#else
|
||||||
|
// ============================================================================
|
||||||
|
// BF16 Fallback Implementation (FP32 Simulation)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Eight BF16 lanes for builds without native BF16 vectors. The values are
// kept widened to FP32 in `reg_fp32`; loads place each 16-bit pattern in the
// upper half of a 32-bit word, stores keep only that upper half (truncation).
struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat32m2_t reg_fp32;

  // Loads eight 16-bit patterns from `ptr` and widens them to FP32.
  explicit BF16Vec8(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[8];
    for (int lane = 0; lane < 8; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(widened + lane, &bits, 4);
    }
    reg_fp32 = __riscv_vle32_v_f32m2(widened, 8);
  }

  // Conversion from FP32; defined out of line.
  explicit BF16Vec8(const FP32Vec8&);

  // Stores all eight lanes to `ptr` as 16-bit patterns.
  void save(void* ptr) const { save(ptr, 8); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    float lanes[8];
    __riscv_vse32_v_f32m2(lanes, reg_fp32, 8);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[8];
    __riscv_vse32_v_f32m2(lanes, reg_fp32, 8);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    // Addressing is done in bytes, so scale the element stride first.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    for (int lane = 0; lane < 8; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      const uint16_t upper = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * stride_in_bytes) = upper;
    }
  }
};
|
||||||
|
|
||||||
|
// Sixteen BF16 lanes for builds without native BF16 vectors. The values are
// kept widened to FP32 in `reg_fp32`; loads place each 16-bit pattern in the
// upper half of a 32-bit word, stores keep only that upper half (truncation).
struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vfloat32m4_t reg_fp32;

  // Loads sixteen 16-bit patterns from `ptr` and widens them to FP32.
  explicit BF16Vec16(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[16];
    for (int lane = 0; lane < 16; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(widened + lane, &bits, 4);
    }
    reg_fp32 = __riscv_vle32_v_f32m4(widened, 16);
  }

  // Conversion from FP32; defined out of line.
  explicit BF16Vec16(const FP32Vec16&);

  // Stores all sixteen lanes to `ptr` as 16-bit patterns.
  void save(void* ptr) const { save(ptr, 16); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    float lanes[16];
    __riscv_vse32_v_f32m4(lanes, reg_fp32, 16);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[16];
    __riscv_vse32_v_f32m4(lanes, reg_fp32, 16);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    // Addressing is done in bytes, so scale the element stride first.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    for (int lane = 0; lane < 16; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      const uint16_t upper = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * stride_in_bytes) = upper;
    }
  }
};
|
||||||
|
|
||||||
|
// Thirty-two BF16 lanes for builds without native BF16 vectors. The values
// are kept widened to FP32 in `reg_fp32`; loads place each 16-bit pattern in
// the upper half of a 32-bit word, stores keep only that upper half
// (truncation).
struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  fixed_vfloat32m8_t reg_fp32;

  // Loads thirty-two 16-bit patterns from `ptr` and widens them to FP32.
  explicit BF16Vec32(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[32];
    for (int lane = 0; lane < 32; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(widened + lane, &bits, 4);
    }
    reg_fp32 = __riscv_vle32_v_f32m8(widened, 32);
  }

  // Broadcasts an 8-lane vector: its lanes are repeated four times.
  explicit BF16Vec32(const BF16Vec8& v) {
    float chunk[8];
    __riscv_vse32_v_f32m2(chunk, v.reg_fp32, 8);
    float tiled[32];
    float* out = tiled;
    for (int rep = 0; rep < 4; ++rep, out += 8) {
      std::memcpy(out, chunk, 8 * sizeof(float));
    }
    reg_fp32 = __riscv_vle32_v_f32m8(tiled, 32);
  }

  // Stores all thirty-two lanes to `ptr` as 16-bit patterns.
  void save(void* ptr) const { save(ptr, 32); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(void* ptr, int elem_num) const {
    float lanes[32];
    __riscv_vse32_v_f32m8(lanes, reg_fp32, 32);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }

  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[32];
    __riscv_vse32_v_f32m8(lanes, reg_fp32, 32);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    // Addressing is done in bytes, so scale the element stride first.
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    for (int lane = 0; lane < 32; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, 4);
      const uint16_t upper = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * stride_in_bytes) = upper;
    }
  }
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// FP32 Implementation
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Four FP32 lanes held in one fixed-length f32m1 register.
struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;
  fixed_vfloat32m1_t reg;

  // All lanes zero.
  explicit FP32Vec4() : FP32Vec4(0.0f) {}

  // All lanes set to `v`.
  explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {}

  // Loads VEC_ELEM_NUM consecutive floats starting at `ptr`.
  explicit FP32Vec4(const float* ptr)
      : reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {}

  // Wraps an already loaded register.
  explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {}

  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}

  // Stores all lanes to `ptr`.
  void save(float* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Stores only the first `elem_num` lanes to `ptr`.
  void save(float* ptr, int elem_num) const {
    __riscv_vse32_v_f32m1(ptr, reg, elem_num);
  }
};
|
||||||
|
|
||||||
|
// Eight FP32 lanes held in one fixed-length f32m2 register group, with the
// element-wise arithmetic and math helpers used by the CPU kernels.
struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat32m2_t reg;

  // All lanes set to `v`.
  explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {}
  // All lanes zero.
  explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {}
  // Loads VEC_ELEM_NUM consecutive floats starting at `ptr`.
  explicit FP32Vec8(const float* ptr)
      : reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {}
  // Wraps an already loaded register group.
  explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {}
  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}
  // Widening conversions from FP16.
  explicit FP32Vec8(const FP16Vec8& v)
      : reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {}
  explicit FP32Vec8(fixed_vfloat16m1_t v)
      : reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {}

#ifdef RISCV_BF16_SUPPORT
  // Widening conversions from native BF16.
  explicit FP32Vec8(fixed_vbfloat16m1_t v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {}
  explicit FP32Vec8(const BF16Vec8& v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {}
#else
  // The fallback BF16 type already stores its lanes as FP32.
  explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {}
#endif

  // Sum of all lanes (unordered reduction seeded with 0).
  float reduce_sum() const {
    fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  }

  FP32Vec8 operator*(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator+(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator-(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator/(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }

  // Lane-wise minimum / maximum / absolute value.
  FP32Vec8 min(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 max(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 abs() const {
    return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
  }

  // Variants that only operate on the first `elem_num` lanes.
  FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
    return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
  }
  FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
    return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
  }

  // Lane-wise min(max_v, max(min_v, x)).
  FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
    fixed_vfloat32m2_t temp =
        __riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
    return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
  }

  // Stores all lanes to `ptr`.
  void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
  // Stores only the first `elem_num` lanes to `ptr`.
  void save(float* ptr, int elem_num) const {
    __riscv_vse32_v_f32m2(ptr, reg, elem_num);
  }
  // Stores all lanes with `stride` elements between consecutive lanes.
  void save_strided(float* ptr, ptrdiff_t stride) const {
    // The strided store takes its stride in bytes, not in elements.
    ptrdiff_t byte_stride = stride * sizeof(float);
    __riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
  }

  // Approximates e^x per lane as 2^n * 2^r with x / ln2 = n + r, where n is
  // an integer and 2^r is evaluated with a degree-5 polynomial in r.
  FP32Vec8 exp() const {
    const float inv_ln2 = 1.44269504088896341f;
    fixed_vfloat32m2_t x_scaled =
        __riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
    // Integer part n and remainder r = x / ln2 - n.
    fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
    fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);

    fixed_vfloat32m2_t r =
        __riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);

    // Horner evaluation; the coefficients are (ln 2)^k / k! for k = 5 .. 0.
    fixed_vfloat32m2_t poly =
        __riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);

    // Build 2^n by writing n + 127 into the exponent field. n is clamped to
    // [-127, 128] first so the biased value stays within the 8-bit field
    // [0, 255]: without the upper bound the shift would spill into the sign
    // bit for large inputs, and the +127 could wrap for a saturated n.
    // 0 yields a zero scale, 255 yields +inf.
    fixed_vint32m2_t n_clamped =
        __riscv_vmin_vx_i32m2(n_int, 128, VEC_ELEM_NUM);
    n_clamped = __riscv_vmax_vx_i32m2(n_clamped, -127, VEC_ELEM_NUM);
    fixed_vint32m2_t biased_exp =
        __riscv_vadd_vx_i32m2(n_clamped, 127, VEC_ELEM_NUM);
    fixed_vint32m2_t exponent_bits =
        __riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
    fixed_vfloat32m2_t scale =
        __riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);

    return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
  }

  // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), with x clamped to [-9, 9] before
  // the exponential is taken.
  FP32Vec8 tanh() const {
    fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
        __riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
    fixed_vfloat32m2_t x2 =
        __riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
    FP32Vec8 exp_val = FP32Vec8(x2).exp();
    fixed_vfloat32m2_t num =
        __riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
    fixed_vfloat32m2_t den =
        __riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
    return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
  }

  // Error function approximation:
  //   erf(|x|) ~= 1 - (a1 t + a2 t^2 + ... + a5 t^5) * e^(-x^2),
  //   t = 1 / (1 + p |x|),
  // with the sign of x restored at the end. The constants match the
  // Abramowitz-Stegun 7.1.26 formula.
  FP32Vec8 er() const {
    const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
                a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
    fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);

    // t = 1 / (1 + p * |x|)
    fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
        __riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
    t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);

    // Horner evaluation of ((((a5 t + a4) t + a3) t + a2) t + a1) t.
    fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a4, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a3, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a2, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a1, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);

    // e^(-x^2)
    fixed_vfloat32m2_t exp_val =
        FP32Vec8(__riscv_vfneg_v_f32m2(
                     __riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
                     VEC_ELEM_NUM))
            .exp()
            .reg;
    // res = 1 - poly * e^(-x^2)
    fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
        __riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
        VEC_ELEM_NUM);

    // Negate only the lanes where x < 0. The mask-undisturbed (_mu) form with
    // `res` as passthrough is required: the plain _m form leaves inactive
    // lanes (x >= 0) with unspecified contents.
    vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
    return FP32Vec8(__riscv_vfneg_v_f32m2_mu(mask, res, res, VEC_ELEM_NUM));
  }
};
|
||||||
|
|
||||||
|
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 16;
|
||||||
|
fixed_vfloat32m4_t reg;
|
||||||
|
|
||||||
|
explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {};
|
||||||
|
explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {};
|
||||||
|
explicit FP32Vec16(const float* ptr)
|
||||||
|
: reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {};
|
||||||
|
explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {};
|
||||||
|
explicit FP32Vec16(const FP32Vec8& data)
|
||||||
|
: reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {};
|
||||||
|
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
|
||||||
|
explicit FP32Vec16(const FP16Vec16& v);
|
||||||
|
|
||||||
|
#ifdef RISCV_BF16_SUPPORT
|
||||||
|
explicit FP32Vec16(fixed_vbfloat16m2_t v)
|
||||||
|
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {};
|
||||||
|
explicit FP32Vec16(const BF16Vec16& v)
|
||||||
|
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {};
|
||||||
|
#else
|
||||||
|
explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FP32Vec16 operator+(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
FP32Vec16 operator-(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
FP32Vec16 operator*(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
float reduce_sum() const {
|
||||||
|
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
|
||||||
|
scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||||
|
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||||
|
}
|
||||||
|
|
||||||
|
float reduce_max() const {
|
||||||
|
fixed_vfloat32m1_t scalar =
|
||||||
|
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
|
||||||
|
scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||||
|
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||||
|
}
|
||||||
|
|
||||||
|
float reduce_min() const {
|
||||||
|
fixed_vfloat32m1_t scalar =
|
||||||
|
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
|
||||||
|
scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||||
|
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int group_size>
|
||||||
|
float reduce_sub_sum(int idx) {
|
||||||
|
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||||
|
const int start = idx * group_size;
|
||||||
|
vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
|
||||||
|
vbool8_t mask = __riscv_vmand_mm_b8(
|
||||||
|
__riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
|
||||||
|
__riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
|
||||||
|
VEC_ELEM_NUM);
|
||||||
|
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
|
||||||
|
scalar =
|
||||||
|
__riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
|
||||||
|
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||||
|
};
|
||||||
|
|
||||||
|
FP32Vec16 max(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
FP32Vec16 min(const FP32Vec16& b) const {
|
||||||
|
return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
FP32Vec16 abs() const {
|
||||||
|
return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
|
||||||
|
return FP32Vec16(__riscv_vfmin_vv_f32m4(
|
||||||
|
max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
|
||||||
|
VEC_ELEM_NUM));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores VEC_ELEM_NUM lanes to consecutive floats starting at `ptr`.
void save(float* ptr) const {
  __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM);
}
|
||||||
|
// Partial store: writes only the first `elem_num` lanes to `ptr`.
void save(float* ptr, int elem_num) const {
  // `elem_num` is passed as the vector length of the store.
  __riscv_vse32_v_f32m4(ptr, reg, elem_num);
}
|
||||||
|
// Strided store: consecutive lanes are written `stride` floats apart,
// starting at `ptr`.
void save_strided(float* ptr, ptrdiff_t stride) const {
  // The strided-store intrinsic takes its stride in bytes, not elements.
  ptrdiff_t stride_in_bytes = stride * sizeof(float);
  __riscv_vsse32_v_f32m4(ptr, stride_in_bytes, reg, VEC_ELEM_NUM);
}
|
||||||
|
|
||||||
|
// Lane-wise approximation of e^x.
//
// Writes e^x as 2^y with y = x / ln 2, splits y into a rounded integer n and
// a remainder r = y - n, evaluates a degree-5 polynomial in r with Horner's
// scheme, and multiplies it by 2^n, which is built directly from IEEE-754
// exponent bits ((n + 127) << 23).
//
// y is clamped to [-127, 128] before the split so that the biased exponent
// always lands in [0, 255]:
//   * y <= -127 yields a scale of 0.0f  -> result 0 (underflow),
//   * y >= 128  yields a scale of +inf  -> result +inf (overflow).
// Without the clamp, large inputs shift exponent bits into the sign bit, and
// very large or infinite inputs saturate the float->int conversion, leaving a
// huge remainder whose polynomial overflows and produces inf * 0 = NaN.
// NaN inputs still produce NaN.
FP32Vec16 exp() const {
  const float inv_ln2 = 1.44269504088896341f;
  const float min_scaled = -127.0f;  // n + 127 == 0   -> scale bits of 0.0f
  const float max_scaled = 128.0f;   // n + 127 == 255 -> scale bits of +inf

  // NaN is the only value that compares unequal to itself. Remember those
  // lanes now, because the vfmax/vfmin clamp below returns the non-NaN
  // operand and would otherwise silently replace NaN with a bound.
  vbool8_t is_nan = __riscv_vmfne_vv_f32m4_b8(reg, reg, VEC_ELEM_NUM);

  fixed_vfloat32m4_t x_scaled =
      __riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
  x_scaled = __riscv_vfmin_vf_f32m4(
      __riscv_vfmax_vf_f32m4(x_scaled, min_scaled, VEC_ELEM_NUM), max_scaled,
      VEC_ELEM_NUM);

  // Integer part n and remainder r = y - n.
  fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
  fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
  fixed_vfloat32m4_t r =
      __riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);

  // Horner evaluation of the polynomial in r, highest coefficient first.
  fixed_vfloat32m4_t poly =
      __riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                0.009618129107628f, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                0.055504108664821f, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                0.240226506959101f, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                0.693147180559945f, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                1.0f, VEC_ELEM_NUM);

  // Build 2^n by placing the biased exponent in bits 23..30 of a float.
  // The lower clamp at 0 is kept as a safeguard; after the clamp on y the
  // biased exponent is already within [0, 255].
  fixed_vint32m4_t biased_exp = __riscv_vmax_vx_i32m4(
      __riscv_vadd_vx_i32m4(n_int, 127, VEC_ELEM_NUM), 0, VEC_ELEM_NUM);
  fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
      __riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));

  fixed_vfloat32m4_t result =
      __riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM);
  // Put NaN back into the lanes whose input was NaN (takes `reg` where the
  // mask is set, `result` elsewhere).
  result = __riscv_vmerge_vvm_f32m4(result, reg, is_nan, VEC_ELEM_NUM);
  return FP32Vec16(result);
}
|
||||||
|
|
||||||
|
// Lane-wise hyperbolic tangent, computed as (e^(2x) - 1) / (e^(2x) + 1).
FP32Vec16 tanh() const {
  // Limit the argument to [-9, 9] before exponentiating.
  fixed_vfloat32m4_t bounded =
      __riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM);
  bounded = __riscv_vfmin_vf_f32m4(bounded, 9.0f, VEC_ELEM_NUM);

  fixed_vfloat32m4_t doubled =
      __riscv_vfmul_vf_f32m4(bounded, 2.0f, VEC_ELEM_NUM);
  FP32Vec16 e2x = FP32Vec16(doubled).exp();

  fixed_vfloat32m4_t numerator =
      __riscv_vfsub_vf_f32m4(e2x.reg, 1.0f, VEC_ELEM_NUM);
  fixed_vfloat32m4_t denominator =
      __riscv_vfadd_vf_f32m4(e2x.reg, 1.0f, VEC_ELEM_NUM);
  return FP32Vec16(
      __riscv_vfdiv_vv_f32m4(numerator, denominator, VEC_ELEM_NUM));
}
|
||||||
|
|
||||||
|
// Lane-wise approximation of the error function.
//
// For |x| it evaluates
//   1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2),
//   with t = 1 / (1 + p*|x|),
// and then negates the result in the lanes where x < 0.
FP32Vec16 er() const {
  const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
              a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;

  // t = 1 / (1 + p * |x|); vfrdiv computes scalar / vector.
  fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
  fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
      __riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
                             1.0f, VEC_ELEM_NUM),
      1.0f, VEC_ELEM_NUM);

  // Horner evaluation of a1*t + ... + a5*t^5 (the last step multiplies by t
  // because the polynomial has no constant term).
  fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                a4, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                a3, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                a2, VEC_ELEM_NUM);
  poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                a1, VEC_ELEM_NUM);
  poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);

  // exp(-x^2)
  fixed_vfloat32m4_t exp_val =
      FP32Vec16(__riscv_vfneg_v_f32m4(
                    __riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
                    VEC_ELEM_NUM))
          .exp()
          .reg;

  // res = 1 - poly * exp(-x^2); vfrsub computes scalar - vector.
  fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
      __riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
      VEC_ELEM_NUM);

  // Negate only the lanes whose input was negative. The `_mu`
  // (mask-undisturbed) form with `res` as the pass-through operand guarantees
  // the remaining lanes keep their value; the plain `_m` form is
  // mask-agnostic and leaves those lanes unspecified.
  vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
  return FP32Vec16(__riscv_vfneg_v_f32m4_mu(mask, res, res, VEC_ELEM_NUM));
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Type Traits & Global Helpers
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Primary template of the element-type -> vector-wrapper mapping. Element
// types without a specialization resolve both aliases to void.
template <typename T>
struct VecType {
  using vec_t = void;
  using vec_type = vec_t;
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
using vec_t = typename VecType<T>::vec_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct VecType<float> {
|
||||||
|
using vec_type = FP32Vec8;
|
||||||
|
using vec_t = FP32Vec8;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct VecType<c10::Half> {
|
||||||
|
using vec_type = FP16Vec8;
|
||||||
|
using vec_t = FP16Vec8;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct VecType<c10::BFloat16> {
|
||||||
|
using vec_type = BF16Vec8;
|
||||||
|
using vec_t = BF16Vec8;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Writes the float `v` to `*ptr`, relying on T's assignment from float for
// the conversion.
template <typename T>
void storeFP32(float v, T* ptr) {
  T& slot = *ptr;
  slot = v;
}
|
||||||
|
template <>
|
||||||
|
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
|
||||||
|
*reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Narrowing conversion: FP32Vec16 -> FP16Vec16.
inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
    : reg(__riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM)) {}
|
||||||
|
// Narrowing conversion: FP32Vec8 -> FP16Vec8.
inline FP16Vec8::FP16Vec8(const FP32Vec8& v)
    : reg(__riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM)) {}
|
||||||
|
// Widening conversion: FP16Vec16 -> FP32Vec16.
inline FP32Vec16::FP32Vec16(const FP16Vec16& v)
    : reg(__riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {}
|
||||||
|
// Free-function wrapper: stores the result of acc.fma(a, b) back into `acc`.
inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  FP32Vec16 updated = acc.fma(a, b);
  acc = updated;
}
|
||||||
|
|
||||||
|
#ifdef RISCV_BF16_SUPPORT
|
||||||
|
template <>
|
||||||
|
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||||
|
*ptr = static_cast<__bf16>(v);
|
||||||
|
};
|
||||||
|
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
|
||||||
|
: reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
|
||||||
|
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
|
||||||
|
: reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
|
||||||
|
#else
|
||||||
|
template <>
|
||||||
|
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||||
|
uint32_t val;
|
||||||
|
std::memcpy(&val, &v, 4);
|
||||||
|
*reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
|
||||||
|
}
|
||||||
|
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
|
||||||
|
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Issues a prefetch hint for `addr` via the compiler builtin.
inline void prefetch(const void* addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/1);
}
|
||||||
|
|
||||||
|
} // namespace vec_op
|
||||||
|
|
||||||
|
// Fallback definitions: when the kernel guard macros are not already defined,
// they expand to nothing.
#if !defined(CPU_KERNEL_GUARD_IN)
  #define CPU_KERNEL_GUARD_IN(NAME)
#endif

#if !defined(CPU_KERNEL_GUARD_OUT)
  #define CPU_KERNEL_GUARD_OUT(NAME)
#endif
|
||||||
|
|
||||||
|
#endif // CPU_TYPES_RISCV_HPP
|
||||||
@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
|||||||
};
|
};
|
||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
#ifdef __aarch64__
|
|
||||||
// dummy M size for prepacking weights
|
// dummy M size for prepacking weights
|
||||||
// Prepacking weights improves performance and avoid runtime reorders
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
constexpr dnnl_dim_t kProbeM = 128;
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
#else
|
|
||||||
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
@@ -411,21 +408,19 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
|||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
|
|
||||||
|
// dummy M size for prepacking weights
|
||||||
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
MSizeCacheKey{
|
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
|
||||||
#ifdef VLLM_USE_ACL
|
// selector can choose an optimally blocked
|
||||||
// Arm Compute Library (ACL) backend for oneDNN does
|
// weight layout.
|
||||||
// not support runtime
|
.a_m_size = kProbeM,
|
||||||
// dimensions, so we set M to a default value
|
.a_m_stride = b_k_size_,
|
||||||
.a_m_size = 128,
|
.use_bias = false,
|
||||||
.a_m_stride = b_k_size_,
|
.bias_type = dnnl::memory::data_type::undef},
|
||||||
#else
|
|
||||||
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
#endif
|
|
||||||
.use_bias = false,
|
|
||||||
.bias_type = dnnl::memory::data_type::undef},
|
|
||||||
true)
|
true)
|
||||||
.weights_desc());
|
.weights_desc());
|
||||||
init_runtime_memory_cache(args);
|
init_runtime_memory_cache(args);
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
// Note: overwrite the external defination for sharing same name between
|
// Note: overwrite the external definition for sharing same name between
|
||||||
// libraries use different ISAs.
|
// libraries use different ISAs.
|
||||||
#define TORCH_EXTENSION_NAME _C
|
#define TORCH_EXTENSION_NAME _C
|
||||||
|
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
|
|||||||
return val;
|
return val;
|
||||||
#else
|
#else
|
||||||
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
||||||
return {};
|
return u32x8_t{};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,23 +211,51 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
|
// 32-bit load / store.
|
||||||
|
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
|
||||||
|
|
||||||
|
// 32-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld32/st32 on ROCm (no .cs hint).
|
||||||
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
|
||||||
int val;
|
int val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
||||||
return val;
|
|
||||||
#else
|
#else
|
||||||
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
|
val = ld32(addr);
|
||||||
return 0;
|
|
||||||
#endif
|
#endif
|
||||||
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
||||||
#else
|
#else
|
||||||
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
|
st32(addr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// 128-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld128/st128 on ROCm (no .cs hint).
|
||||||
|
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
|
||||||
|
int4 val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
|
||||||
|
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||||
|
: "l"(addr));
|
||||||
|
#else
|
||||||
|
ld128(val, addr);
|
||||||
|
#endif
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
|
||||||
|
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
|
||||||
|
#else
|
||||||
|
st128(val, addr);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +288,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
|
|||||||
|
|
||||||
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
||||||
bool pred) {
|
bool pred) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
uint32_t r0, r1, r2, r3;
|
uint32_t r0, r1, r2, r3;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@@ -278,7 +306,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
|||||||
|
|
||||||
val = uint4{r0, r1, r2, r3};
|
val = uint4{r0, r1, r2, r3};
|
||||||
#else
|
#else
|
||||||
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
|
assert(false && "ld128_cg_or_zero is not supported on ROCm");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
|||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
int flag = 0;
|
int flag = 0;
|
||||||
CUDA_CHECK(cuDeviceGetAttribute(
|
CUresult rdma_result = cuDeviceGetAttribute(
|
||||||
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
||||||
device));
|
device);
|
||||||
if (flag) { // support GPUDirect RDMA if possible
|
if (rdma_result == CUDA_SUCCESS &&
|
||||||
|
flag) { // support GPUDirect RDMA if possible
|
||||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||||
}
|
}
|
||||||
int fab_flag = 0;
|
int fab_flag = 0;
|
||||||
CUDA_CHECK(cuDeviceGetAttribute(
|
CUresult fab_result = cuDeviceGetAttribute(
|
||||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
|
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
|
||||||
if (fab_flag) { // support fabric handle if possible
|
if (fab_result == CUDA_SUCCESS &&
|
||||||
|
fab_flag) { // support fabric handle if possible
|
||||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
|
|||||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||||
int32_t const SENTINEL =
|
int32_t const SENTINEL =
|
||||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||||
// Intialize sorted_ids
|
// Initialize sorted_ids
|
||||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||||
sorted_ids[i] = SENTINEL;
|
sorted_ids[i] = SENTINEL;
|
||||||
}
|
}
|
||||||
// Intialize expert_ids with -1
|
// Initialize expert_ids with -1
|
||||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||||
block_ids[i] = -1;
|
block_ids[i] = -1;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -73,10 +73,9 @@ void moe_permute(
|
|||||||
MOE_DISPATCH(input.scalar_type(), [&] {
|
MOE_DISPATCH(input.scalar_type(), [&] {
|
||||||
expandInputRowsKernelLauncher<scalar_t>(
|
expandInputRowsKernelLauncher<scalar_t>(
|
||||||
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
||||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
|
||||||
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
|
get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
|
||||||
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
|
||||||
n_hidden, topk, n_local_expert, stream);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
T const* unpermuted_input, T* permuted_output,
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
template <typename T, bool CHECK_SKIPPED>
|
template <typename T, bool CHECK_SKIPPED>
|
||||||
__global__ void expandInputRowsKernel(
|
__global__ void expandInputRowsKernel(
|
||||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
T const* unpermuted_input, T* permuted_output,
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||||
@@ -16,7 +16,6 @@ __global__ void expandInputRowsKernel(
|
|||||||
int64_t expanded_dest_row = blockIdx.x;
|
int64_t expanded_dest_row = blockIdx.x;
|
||||||
int64_t const expanded_source_row =
|
int64_t const expanded_source_row =
|
||||||
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
|
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
|
||||||
int expert_id = sorted_experts[expanded_dest_row];
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
assert(expanded_dest_row <= INT32_MAX);
|
assert(expanded_dest_row <= INT32_MAX);
|
||||||
@@ -54,7 +53,7 @@ __global__ void expandInputRowsKernel(
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void expandInputRowsKernelLauncher(
|
void expandInputRowsKernelLauncher(
|
||||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
T const* unpermuted_input, T* permuted_output,
|
||||||
int const* expanded_dest_row_to_expanded_source_row,
|
int const* expanded_dest_row_to_expanded_source_row,
|
||||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||||
@@ -70,12 +69,12 @@ void expandInputRowsKernelLauncher(
|
|||||||
bool is_check_skip = num_valid_tokens_ptr != nullptr;
|
bool is_check_skip = num_valid_tokens_ptr != nullptr;
|
||||||
auto func = func_map[is_check_skip];
|
auto func = func_map[is_check_skip];
|
||||||
|
|
||||||
func<<<blocks, threads, 0, stream>>>(
|
func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
|
||||||
unpermuted_input, permuted_output, sorted_experts,
|
expanded_dest_row_to_expanded_source_row,
|
||||||
expanded_dest_row_to_expanded_source_row,
|
expanded_source_row_to_expanded_dest_row,
|
||||||
expanded_source_row_to_expanded_dest_row, permuted_idx,
|
permuted_idx, expert_first_token_offset,
|
||||||
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
|
num_rows, num_valid_tokens_ptr, cols, k,
|
||||||
num_local_experts);
|
num_local_experts);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class T, class U>
|
template <class T, class U>
|
||||||
|
|||||||
12
csrc/ops.h
12
csrc/ops.h
@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
|
|||||||
|
|
||||||
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
|
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
|
||||||
|
|
||||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
|
||||||
torch::Tensor& output_scale,
|
torch::Tensor const& input, torch::Tensor const& input_scale,
|
||||||
torch::Tensor const& input_scale,
|
bool is_sf_swizzled_layout);
|
||||||
bool is_sf_swizzled_layout);
|
|
||||||
|
void scaled_fp4_quant_out(torch::Tensor const& input,
|
||||||
|
torch::Tensor const& input_scale,
|
||||||
|
bool is_sf_swizzled_layout, torch::Tensor& output,
|
||||||
|
torch::Tensor& output_scale);
|
||||||
|
|
||||||
void scaled_fp4_experts_quant(
|
void scaled_fp4_experts_quant(
|
||||||
torch::Tensor& output, torch::Tensor& output_scale,
|
torch::Tensor& output, torch::Tensor& output_scale,
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user