Compare commits
431 Commits
v0.16.1rc0
...
v0.17.1rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83b933b84 | ||
|
|
82f3f30e26 | ||
|
|
9095cbbfb6 | ||
|
|
721ae79f50 | ||
|
|
aefc59f088 | ||
|
|
d88f28da05 | ||
|
|
106ff69c4e | ||
|
|
ca5fb4bbd8 | ||
|
|
cf88b23749 | ||
|
|
a3189a08b0 | ||
|
|
409c4e632d | ||
|
|
8850738b70 | ||
|
|
234860399b | ||
|
|
c88510083b | ||
|
|
4ff8c3c8f9 | ||
|
|
507ddbe992 | ||
|
|
ddbb0d230a | ||
|
|
9efc3bdcd6 | ||
|
|
156e33553c | ||
|
|
d0cd736caa | ||
|
|
195c997203 | ||
|
|
04b67d8f62 | ||
|
|
7279374f91 | ||
|
|
006aea17d7 | ||
|
|
0836be3b03 | ||
|
|
4e95ec111c | ||
|
|
179547d62c | ||
|
|
f85b4eda3a | ||
|
|
2a194ddd72 | ||
|
|
203a7f27da | ||
|
|
483463f735 | ||
|
|
4e571ce643 | ||
|
|
4ff9b045fe | ||
|
|
3fd03f1ec2 | ||
|
|
10a5f4d53d | ||
|
|
fe0c085c28 | ||
|
|
8d6b3d5dda | ||
|
|
4b87ffbefb | ||
|
|
fa028207aa | ||
|
|
d460a18fc6 | ||
|
|
6e956d9eca | ||
|
|
1e0f917b34 | ||
|
|
c174d54f86 | ||
|
|
55d27cca55 | ||
|
|
580864d81e | ||
|
|
2b28b9b269 | ||
|
|
70485a11bd | ||
|
|
74a9f54cdb | ||
|
|
00c4cb5606 | ||
|
|
941e52c298 | ||
|
|
be292b7c14 | ||
|
|
77a73458e3 | ||
|
|
5578f2a4d3 | ||
|
|
3ec2115015 | ||
|
|
b0906d8b02 | ||
|
|
aaf5fa9abf | ||
|
|
f96c3ab08c | ||
|
|
dc6b578466 | ||
|
|
1bc9c77f6d | ||
|
|
65a4da1504 | ||
|
|
217f27598d | ||
|
|
fff3711a24 | ||
|
|
c4d859c274 | ||
|
|
747431044d | ||
|
|
d62856b928 | ||
|
|
bd2659a566 | ||
|
|
90512b2e8b | ||
|
|
dcf8862fd4 | ||
|
|
43aa389231 | ||
|
|
384425f84e | ||
|
|
a0f44bb616 | ||
|
|
fde4771bbd | ||
|
|
e5ff140216 | ||
|
|
0a6a3a1290 | ||
|
|
4497431df6 | ||
|
|
b7332b058c | ||
|
|
40077ea3de | ||
|
|
5d6aae4577 | ||
|
|
63298ee173 | ||
|
|
2dde535df1 | ||
|
|
379689d533 | ||
|
|
a6be75dbd2 | ||
|
|
ee54f9cdb9 | ||
|
|
fc4657756f | ||
|
|
eebd14651f | ||
|
|
ebb9cc5f2b | ||
|
|
85f50eb41f | ||
|
|
5261223c2d | ||
|
|
00b814ba5a | ||
|
|
ee8a29511f | ||
|
|
755356b3d1 | ||
|
|
58928475e4 | ||
|
|
1a9718085c | ||
|
|
7eb524e64c | ||
|
|
c7f32e08c2 | ||
|
|
b354686524 | ||
|
|
6a18d8789b | ||
|
|
24a03915f5 | ||
|
|
b5e34e1fca | ||
|
|
ce8546a12b | ||
|
|
c188749bcd | ||
|
|
225d1090a0 | ||
|
|
f3c6c9c9d7 | ||
|
|
26bd43b52d | ||
|
|
6b625a8807 | ||
|
|
54756b6109 | ||
|
|
39f9ea0da4 | ||
|
|
e4ae148a78 | ||
|
|
1d0c0d209c | ||
|
|
fcb73f306c | ||
|
|
e2090bf3af | ||
|
|
2a00d3241f | ||
|
|
10f4db4dbe | ||
|
|
5b3ba94ab4 | ||
|
|
90f3c01fa4 | ||
|
|
807d680337 | ||
|
|
5afb387bd4 | ||
|
|
43e77e59ab | ||
|
|
00bd08edee | ||
|
|
43f10573c9 | ||
|
|
86e1060b17 | ||
|
|
27066d1b2b | ||
|
|
57c84ff129 | ||
|
|
e68de8adc0 | ||
|
|
a1ffa56a1e | ||
|
|
0a208d1f54 | ||
|
|
03a49bb8f0 | ||
|
|
8e87cc57f1 | ||
|
|
6dd302653f | ||
|
|
de00ebeac4 | ||
|
|
639680d220 | ||
|
|
c5362c739f | ||
|
|
0a49676fb0 | ||
|
|
c012a8c477 | ||
|
|
ebed80a7c8 | ||
|
|
a73af584fe | ||
|
|
a97954b6a8 | ||
|
|
a911f4dd20 | ||
|
|
5395471d29 | ||
|
|
a57c877f18 | ||
|
|
f917020983 | ||
|
|
86483ca774 | ||
|
|
b93a9e6f6d | ||
|
|
d8839ef7d9 | ||
|
|
e998fa76b9 | ||
|
|
6a895197fa | ||
|
|
8c760b6ab6 | ||
|
|
3ee68590c7 | ||
|
|
7196348157 | ||
|
|
176c799f4c | ||
|
|
612e7729c2 | ||
|
|
ecde7af9c4 | ||
|
|
8df523351f | ||
|
|
b03ff6a96b | ||
|
|
ed81d5edd1 | ||
|
|
3c23ac840e | ||
|
|
a708ef5944 | ||
|
|
66a2209645 | ||
|
|
0bfa229bf1 | ||
|
|
7493c51c55 | ||
|
|
ac773bbe80 | ||
|
|
48e376a007 | ||
|
|
21eb2c3372 | ||
|
|
e2b31243c0 | ||
|
|
c3598d02fa | ||
|
|
57c629e9c1 | ||
|
|
d106bf39f5 | ||
|
|
b0651021e5 | ||
|
|
f600d5192e | ||
|
|
8e7820131e | ||
|
|
0a12cea25f | ||
|
|
dd6dbd93f8 | ||
|
|
26366009c5 | ||
|
|
16c472abe7 | ||
|
|
3b23d57c96 | ||
|
|
2f4226fe52 | ||
|
|
792cbd64ca | ||
|
|
2ed4722e26 | ||
|
|
a3299c3d1d | ||
|
|
6c21a0c2d7 | ||
|
|
562339abc3 | ||
|
|
d7adcadb9b | ||
|
|
f678c3f61a | ||
|
|
be0a3f7570 | ||
|
|
17dc9c7fc9 | ||
|
|
7eca859110 | ||
|
|
636ee223ac | ||
|
|
b7d59ffce2 | ||
|
|
5569f5218d | ||
|
|
138d891d7f | ||
|
|
d7166e74c1 | ||
|
|
417fd28fb1 | ||
|
|
7faba503c4 | ||
|
|
bc6be89d16 | ||
|
|
32224f568a | ||
|
|
f3dc292e9f | ||
|
|
138c5fa186 | ||
|
|
2f2c1d73a7 | ||
|
|
fb3e78ab09 | ||
|
|
fd3bfe74c9 | ||
|
|
bfdb512f11 | ||
|
|
d25c1ec3c9 | ||
|
|
7cc6058ac6 | ||
|
|
28028dff2f | ||
|
|
3417ba5648 | ||
|
|
58cfe0dc44 | ||
|
|
e86221deb6 | ||
|
|
289fc48ab7 | ||
|
|
2f2212e6cc | ||
|
|
18e01a0a10 | ||
|
|
6cb901093f | ||
|
|
ead7bde1ab | ||
|
|
6aa6ad8992 | ||
|
|
c8c3935b70 | ||
|
|
bb6888b8b1 | ||
|
|
1aaec59d79 | ||
|
|
1659b2e058 | ||
|
|
d6e04f4c43 | ||
|
|
a8f66cbde8 | ||
|
|
16d2ad1d38 | ||
|
|
5dc3538736 | ||
|
|
36bf213181 | ||
|
|
6f0dd93801 | ||
|
|
5d199ac8f2 | ||
|
|
9e0f44bec4 | ||
|
|
097eb544e9 | ||
|
|
7cdba98edf | ||
|
|
3c85cd9d74 | ||
|
|
edba15045a | ||
|
|
e379396167 | ||
|
|
6e9f21e8a2 | ||
|
|
c1d963403c | ||
|
|
77e6dcbbfa | ||
|
|
70c73df69e | ||
|
|
9a9d442464 | ||
|
|
f7da9cdffc | ||
|
|
f22ff2958c | ||
|
|
d15c3b90fc | ||
|
|
97286a20ed | ||
|
|
12b38c0f45 | ||
|
|
467886a0c4 | ||
|
|
a9b8b13e5c | ||
|
|
e7213003cb | ||
|
|
3a8eef5869 | ||
|
|
97995f6376 | ||
|
|
881a6b011b | ||
|
|
8e1fd5baf0 | ||
|
|
ae88468bcc | ||
|
|
e05cb3b93e | ||
|
|
28ef9ba399 | ||
|
|
fb7fdc49c4 | ||
|
|
ea463978bb | ||
|
|
440f0e7dc6 | ||
|
|
fd4a90f337 | ||
|
|
ad9d09e2b8 | ||
|
|
4beebfd146 | ||
|
|
b8401cde0e | ||
|
|
5dfc5abe94 | ||
|
|
8fa68a8ce4 | ||
|
|
35a6f0bfe2 | ||
|
|
3a6cbf16e2 | ||
|
|
f44d1ddc8c | ||
|
|
48a54c1e0d | ||
|
|
8b9e8b7454 | ||
|
|
c21d0039ec | ||
|
|
7d8bbe6f42 | ||
|
|
25e02647c2 | ||
|
|
a0a5178ab4 | ||
|
|
8ea8ba275e | ||
|
|
4f85bae9d6 | ||
|
|
0a7165fd71 | ||
|
|
6521ccf286 | ||
|
|
8ebd872f50 | ||
|
|
168ee03e1c | ||
|
|
9dd656f0ea | ||
|
|
c8b678e53e | ||
|
|
18c29c746b | ||
|
|
96fc09503a | ||
|
|
1b82b433fc | ||
|
|
9319044ee9 | ||
|
|
c42dc402c1 | ||
|
|
fa6a6be519 | ||
|
|
cad21918e3 | ||
|
|
53700bf49b | ||
|
|
a13d8c03c9 | ||
|
|
9433acb8df | ||
|
|
d1a6e96d9e | ||
|
|
2a9e3347e9 | ||
|
|
cc0d565f40 | ||
|
|
358e4d5ba7 | ||
|
|
792a74b973 | ||
|
|
4034c3d32e | ||
|
|
7560d674c9 | ||
|
|
d9c7730877 | ||
|
|
ada4f4fadd | ||
|
|
7e9149d9a9 | ||
|
|
87c98b0236 | ||
|
|
de7dd634b9 | ||
|
|
9a87b0578f | ||
|
|
510bc9e1df | ||
|
|
cbd361fd46 | ||
|
|
c212202d93 | ||
|
|
ec27b36b4b | ||
|
|
3fd1d4ec2c | ||
|
|
cb21972a97 | ||
|
|
c34963f138 | ||
|
|
f26650d649 | ||
|
|
92f5d0f070 | ||
|
|
a60985b07e | ||
|
|
8b5014d3dd | ||
|
|
57a96e26c9 | ||
|
|
e82fbeec7b | ||
|
|
6290470843 | ||
|
|
72f4d16262 | ||
|
|
5a435507d8 | ||
|
|
59d7af9c6c | ||
|
|
bbf81f9a92 | ||
|
|
da543d1abe | ||
|
|
87d319c52f | ||
|
|
a9ec392c86 | ||
|
|
afd089f231 | ||
|
|
3ecd0bf9fc | ||
|
|
e3eb146f7a | ||
|
|
95a395dbec | ||
|
|
e94b263bd6 | ||
|
|
e113a30113 | ||
|
|
1dafb29f91 | ||
|
|
49b9ae32e9 | ||
|
|
63d7972f13 | ||
|
|
c68e69f144 | ||
|
|
7e08c22b8c | ||
|
|
8e75d88554 | ||
|
|
0892d1ab1f | ||
|
|
7600642eae | ||
|
|
1e69c04887 | ||
|
|
4292e3b807 | ||
|
|
24d6ea8afd | ||
|
|
57c86c0741 | ||
|
|
06254d4cbb | ||
|
|
f5d1281c9d | ||
|
|
94029ffaf0 | ||
|
|
88e8525f2e | ||
|
|
b2d8b422b2 | ||
|
|
1d5ab5d603 | ||
|
|
7b346ba8ed | ||
|
|
dea268336f | ||
|
|
90805ff464 | ||
|
|
2562e0271e | ||
|
|
fd68cd132b | ||
|
|
0edf101d2b | ||
|
|
d5b6f3ba36 | ||
|
|
1a014a0a93 | ||
|
|
86ac7bcf84 | ||
|
|
405f28d38d | ||
|
|
5323672bc2 | ||
|
|
a201ad72d8 | ||
|
|
e3691988d0 | ||
|
|
9fa6c68fa6 | ||
|
|
2ce6f3cf67 | ||
|
|
1f3dbd95fd | ||
|
|
1d532f9d8f | ||
|
|
234a65b781 | ||
|
|
2decec9856 | ||
|
|
29b35477b0 | ||
|
|
b1d9f5372d | ||
|
|
fd6de37fca | ||
|
|
c8aca0c9e1 | ||
|
|
b602e4f299 | ||
|
|
157722da75 | ||
|
|
1d897ff04f | ||
|
|
905d76b51d | ||
|
|
9098ce690c | ||
|
|
876312f0b5 | ||
|
|
5de98abc12 | ||
|
|
9251ed5c4f | ||
|
|
e8249378e4 | ||
|
|
6d4f9d3ad5 | ||
|
|
fbe3f0120a | ||
|
|
66c1751d13 | ||
|
|
6467b635b6 | ||
|
|
9c3fe9936b | ||
|
|
b66a74649e | ||
|
|
07bdabef03 | ||
|
|
a572baff5e | ||
|
|
516cf26698 | ||
|
|
487e5c51f7 | ||
|
|
1a8c71674e | ||
|
|
062b789632 | ||
|
|
a532c83849 | ||
|
|
1e5ad9b74f | ||
|
|
cabdaa7619 | ||
|
|
06be53563b | ||
|
|
c29ee9c326 | ||
|
|
d43048ce05 | ||
|
|
4fec53cfcb | ||
|
|
38c498b8e3 | ||
|
|
56a6371706 | ||
|
|
6283021142 | ||
|
|
01923eec70 | ||
|
|
31fb6f43da | ||
|
|
eb19955c37 | ||
|
|
0f2f24c8b2 | ||
|
|
d0105b84f0 | ||
|
|
832a780f3a | ||
|
|
98217b09f9 | ||
|
|
967572dd5f | ||
|
|
3d66502e1b | ||
|
|
c66aa48e99 | ||
|
|
b6d5a17298 | ||
|
|
5e58bdc711 | ||
|
|
a1f53addb1 | ||
|
|
05970c772c | ||
|
|
d940607629 | ||
|
|
99c7892c5b | ||
|
|
ec8f943db1 | ||
|
|
f2ad952f40 | ||
|
|
9e2cabdf9c | ||
|
|
ec8ab9d254 | ||
|
|
05972ea7e5 | ||
|
|
111d869069 | ||
|
|
7fea7250a4 | ||
|
|
845ee348ef | ||
|
|
ec13e549d3 | ||
|
|
c6ca51598a | ||
|
|
c0615a296d | ||
|
|
01914445b0 | ||
|
|
5281713e11 | ||
|
|
32693db8ce | ||
|
|
e03ddcfbd4 | ||
|
|
02acd16861 | ||
|
|
ab87f85231 |
@@ -13,9 +13,10 @@ import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
"allow_deprecated_quantization=True,"
|
||||
)
|
||||
|
||||
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||
model_args += "attention_backend=TRITON_ATTN"
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
min_acceptable = ground_truth * (1 - rtol)
|
||||
success = success and measured_value >= min_acceptable
|
||||
|
||||
assert success
|
||||
|
||||
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
|
||||
@@ -51,5 +51,56 @@
|
||||
"max-model-len": 256,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_deepseek_r1",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
"dtype": "bfloat16"
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"max-model-len": 512,
|
||||
"max-num-seqs": 128,
|
||||
"async-scheduling": "",
|
||||
"gpu-memory-utilization": 0.95,
|
||||
"enable_expert_parallel": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "latency_qwen3_8b",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "Qwen/Qwen3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 128,
|
||||
"dtype": "bfloat16",
|
||||
"async-scheduling": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -37,7 +36,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -64,7 +62,6 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
@@ -78,5 +75,83 @@
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_deepseek_r1",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 200,
|
||||
"async-scheduling": "",
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
|
||||
"qps_list": [1, 4, 16, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"disable_log_stats": "",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 128,
|
||||
"async-scheduling": "",
|
||||
"enable_expert_parallel": "",
|
||||
"max-num-batched-tokens": 4096
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_qwen3_8b",
|
||||
"qps_list": [1, 4, 10, "inf"],
|
||||
"server_environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"disable_log_stats": "",
|
||||
"async-scheduling": ""
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 1,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -23,7 +22,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -41,7 +39,6 @@
|
||||
"server_parameters": {
|
||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
"tensor_parallel_size": 2,
|
||||
"swap_space": 16,
|
||||
"disable_log_stats": "",
|
||||
"load_format": "dummy"
|
||||
},
|
||||
@@ -59,7 +56,6 @@
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
|
||||
@@ -57,5 +57,67 @@
|
||||
"max-num-seqs": 512,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_deepseek_r1",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "deepseek-ai/DeepSeek-R1",
|
||||
"tensor_parallel_size": 8,
|
||||
"load_format": "dummy",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"backend": "vllm",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 384,
|
||||
"async-scheduling": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
"tensor_parallel_size": 8,
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"backend": "vllm",
|
||||
"max-model-len": 2048,
|
||||
"max-num-seqs": 512,
|
||||
"async-scheduling": "",
|
||||
"enable_expert_parallel": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "throughput_qwen3_8b",
|
||||
"environment_variables": {
|
||||
"PT_HPU_LAZY_MODE": 1,
|
||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
||||
"VLLM_CONTIGUOUS_PA": 1,
|
||||
"VLLM_DEFRAG": 1
|
||||
},
|
||||
"parameters": {
|
||||
"model": "Qwen/Qwen-3-8B",
|
||||
"tensor_parallel_size": 1,
|
||||
"load_format": "dummy",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"dataset_name": "sharegpt",
|
||||
"num_prompts": 1000,
|
||||
"max-num-seqs": 512,
|
||||
"backend": "vllm",
|
||||
"async-scheduling": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
|
||||
\`\`\`
|
||||
|
||||
@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
- **aiter**: Aiter for ROCm
|
||||
- **amd_aiter**: Aiter for ROCm
|
||||
- **flash-attn**: Flash Attention for ROCm
|
||||
|
||||
### :warning: Notes
|
||||
|
||||
213
.buildkite/scripts/check-ray-compatibility.sh
Normal file
213
.buildkite/scripts/check-ray-compatibility.sh
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Check if Ray LLM can generate lock files that are compatible with this
|
||||
# version of vllm. Downloads Ray's requirement files and runs a full
|
||||
# dependency resolution with the installed vllm's constraints to see if
|
||||
# a valid lock file can be produced.
|
||||
#
|
||||
# See: https://github.com/vllm-project/vllm/issues/33599
|
||||
|
||||
set -eo pipefail
|
||||
|
||||
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
|
||||
|
||||
WORK_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$WORK_DIR"' EXIT
|
||||
|
||||
# Fetch all Ray requirement files used in the LLM depset pipeline
|
||||
echo ">>> Fetching Ray requirement files"
|
||||
RAY_FILES=(
|
||||
"requirements.txt"
|
||||
"requirements/cloud-requirements.txt"
|
||||
"requirements/base-test-requirements.txt"
|
||||
"requirements/llm/llm-requirements.txt"
|
||||
"requirements/llm/llm-test-requirements.txt"
|
||||
)
|
||||
for FILE in "${RAY_FILES[@]}"; do
|
||||
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
|
||||
echo " ${FILE}"
|
||||
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
|
||||
done
|
||||
|
||||
# Extract installed vllm deps
|
||||
echo ">>> Extracting installed vllm dependency constraints"
|
||||
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
|
||||
"""Write out the installed vllm's dependencies as pip constraint lines.
|
||||
|
||||
Ray uses vllm[audio], so audio-extra deps are included with their extra
|
||||
markers stripped. The resolver cannot evaluate extra markers for a
|
||||
package that is not itself being resolved from an index, so we activate
|
||||
them manually here.
|
||||
"""
|
||||
import importlib.metadata
|
||||
import re
|
||||
import sys
|
||||
|
||||
out_path = sys.argv[1]
|
||||
raw_reqs = importlib.metadata.requires("vllm") or []
|
||||
|
||||
# Ray uses vllm[audio] – activate that extra.
|
||||
ACTIVE_EXTRAS = {"audio"}
|
||||
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
|
||||
|
||||
lines = []
|
||||
for r in raw_reqs:
|
||||
if ";" not in r:
|
||||
# Unconditional dep — always include.
|
||||
lines.append(r.strip())
|
||||
continue
|
||||
|
||||
req_part, _, marker_part = r.partition(";")
|
||||
marker_part = marker_part.strip()
|
||||
|
||||
extra_matches = EXTRA_RE.findall(marker_part)
|
||||
if not extra_matches:
|
||||
# Non-extra marker (python_version, etc.) — keep as-is.
|
||||
lines.append(r.strip())
|
||||
continue
|
||||
|
||||
if not ACTIVE_EXTRAS.intersection(extra_matches):
|
||||
continue # Skip inactive extras (tensorizer, bench, …).
|
||||
|
||||
# Strip the extra== conditions but keep any remaining markers
|
||||
# (e.g. python_version).
|
||||
cleaned = EXTRA_RE.sub("", marker_part)
|
||||
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
|
||||
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
|
||||
|
||||
if cleaned:
|
||||
lines.append(f"{req_part.strip()} ; {cleaned}")
|
||||
else:
|
||||
lines.append(req_part.strip())
|
||||
|
||||
with open(out_path, "w") as f:
|
||||
for line in lines:
|
||||
f.write(line + "\n")
|
||||
|
||||
print(f"Wrote {len(lines)} constraints to {out_path}")
|
||||
PYEOF
|
||||
|
||||
echo ">>> Installed vllm deps (first 20 lines):"
|
||||
head -20 "${WORK_DIR}/vllm-constraints.txt"
|
||||
|
||||
# Remove Ray's vllm pin — the installed vllm's transitive deps
|
||||
# (written above) replace it in the resolution. vllm itself cannot
|
||||
# be resolved from PyPI for in-development versions, so we test
|
||||
# whether Ray's requirements can coexist with vllm's dependency
|
||||
# constraints instead.
|
||||
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
|
||||
|
||||
# Install uv if needed
|
||||
if ! command -v uv &>/dev/null; then
|
||||
echo ">>> Installing uv"
|
||||
pip install uv -q
|
||||
fi
|
||||
|
||||
# Resolve: given vllm's constraints, can Ray compile a lock file?
|
||||
#
|
||||
# vllm's dependency constraints are the fixed side — Ray is flexible and
|
||||
# can regenerate its lock files. We pass vllm's constraints via -c so
|
||||
# the resolver treats them as non-negotiable bounds, then check whether
|
||||
# Ray's own requirements can still be satisfied within those bounds.
|
||||
echo ""
|
||||
echo "============================================================"
|
||||
echo ">>> Resolving: Can Ray generate compatible lock files?"
|
||||
echo "============================================================"
|
||||
|
||||
set +e
|
||||
uv pip compile \
|
||||
"${WORK_DIR}/requirements.txt" \
|
||||
"${WORK_DIR}/cloud-requirements.txt" \
|
||||
"${WORK_DIR}/base-test-requirements.txt" \
|
||||
"${WORK_DIR}/llm-requirements.txt" \
|
||||
"${WORK_DIR}/llm-test-requirements.txt" \
|
||||
-c "${WORK_DIR}/vllm-constraints.txt" \
|
||||
--python-version 3.12 \
|
||||
--python-platform x86_64-manylinux_2_31 \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu129 \
|
||||
--index-strategy unsafe-best-match \
|
||||
--unsafe-package setuptools \
|
||||
--unsafe-package ray \
|
||||
--no-header \
|
||||
-o "${WORK_DIR}/resolved.txt" \
|
||||
2>&1
|
||||
EXIT_CODE=$?
|
||||
set -e
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
if [ $EXIT_CODE -eq 0 ]; then
|
||||
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
|
||||
echo ""
|
||||
echo "Key resolved versions:"
|
||||
grep -E '^(protobuf|torch|numpy|transformers)==' \
|
||||
"${WORK_DIR}/resolved.txt" | sort || true
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
|
||||
echo "This means a fundamental dependency conflict exists that Ray"
|
||||
echo "cannot resolve by regenerating its lock files."
|
||||
echo "See: https://github.com/vllm-project/vllm/issues/33599"
|
||||
echo "=========================================="
|
||||
|
||||
# Buildkite annotation
|
||||
if [ -f /usr/bin/buildkite-agent ]; then
|
||||
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
|
||||
### :warning: Ray Dependency Compatibility Warning
|
||||
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
|
||||
Ray would not be able to regenerate its lock files to accommodate this vllm version.
|
||||
|
||||
Please check the **Ray Dependency Compatibility Check** step logs for details.
|
||||
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
|
||||
EOF
|
||||
fi
|
||||
|
||||
# Notify Slack if webhook is configured and PR/branch are valid.
|
||||
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||
PR="${BUILDKITE_PULL_REQUEST:-}"
|
||||
BRANCH="${BUILDKITE_BRANCH:-}"
|
||||
|
||||
# Skip notification if PR is invalid or branch is empty
|
||||
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
|
||||
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
|
||||
else
|
||||
echo ">>> Sending Slack notification"
|
||||
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||
# shellcheck disable=SC2016
|
||||
PAYLOAD=$(python3 -c '
|
||||
import json, os, sys
|
||||
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
|
||||
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
|
||||
url = os.getenv("BUILDKITE_BUILD_URL", "#")
|
||||
data = {
|
||||
"text": ":warning: Ray Dependency Compatibility Check Failed",
|
||||
"blocks": [{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": (
|
||||
"*:warning: Ray Dependency Compatibility Check Failed*\n"
|
||||
f"PR #{pr} on branch `{branch}` introduces dependencies "
|
||||
f"that cannot be resolved with Ray'\''s requirements.\n"
|
||||
f"<{url}|View Build>"
|
||||
),
|
||||
},
|
||||
}],
|
||||
}
|
||||
print(json.dumps(data))
|
||||
')
|
||||
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
|
||||
-H 'Content-type: application/json' \
|
||||
-d "$PAYLOAD")
|
||||
echo " Slack webhook response: $HTTP_CODE"
|
||||
fi
|
||||
else
|
||||
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||
fi
|
||||
|
||||
exit 1
|
||||
@@ -6,6 +6,26 @@
|
||||
# Multi-node detection: Instead of matching on fragile group names, we detect
|
||||
# multi-node jobs structurally by looking for the bracket command syntax
|
||||
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
|
||||
#
|
||||
###############################################################################
|
||||
# QUOTING / COMMAND PASSING
|
||||
#
|
||||
# Passing commands as positional arguments ($*) is fragile when the command
|
||||
# string itself contains double quotes, e.g.:
|
||||
#
|
||||
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
|
||||
#
|
||||
# The outer shell resolves the nested quotes *before* this script runs, so
|
||||
# the script receives mangled input it cannot fully recover.
|
||||
#
|
||||
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
|
||||
#
|
||||
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
|
||||
# bash run-amd-test.sh
|
||||
#
|
||||
# Single-quoted assignment preserves all inner double quotes verbatim.
|
||||
# The $* path is kept for backward compatibility but callers should migrate.
|
||||
###############################################################################
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
@@ -79,26 +99,157 @@ is_multi_node() {
|
||||
return 1
|
||||
}
|
||||
|
||||
handle_pytest_exit() {
|
||||
local exit_code=$1
|
||||
if [ "$exit_code" -eq 5 ]; then
|
||||
echo "Pytest exit code 5 (no tests collected) - treating as success."
|
||||
exit 0
|
||||
fi
|
||||
exit "$exit_code"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
# Pytest marker re-quoting
|
||||
# Pytest marker/keyword re-quoting
|
||||
#
|
||||
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
|
||||
# quotes around pytest -m marker expressions get stripped:
|
||||
# quotes around multi-word pytest -m/-k expressions get stripped:
|
||||
# pytest -v -s -m 'not cpu_test' v1/core
|
||||
# becomes:
|
||||
# pytest -v -s -m not cpu_test v1/core
|
||||
#
|
||||
# pytest then interprets "cpu_test" as a file path, not part of the marker.
|
||||
# This function detects unquoted multi-word marker expressions and re-quotes
|
||||
# them so they survive the final bash -c expansion.
|
||||
#
|
||||
# This function detects unquoted expressions after -m/-k and re-quotes them
|
||||
# by collecting tokens until a recognizable boundary is reached:
|
||||
# - test path (contains '/')
|
||||
# - test file (ends with '.py')
|
||||
# - another pytest flag (--xxx or -x single-char flags)
|
||||
# - command separator (&& || ; |)
|
||||
# - environment variable assignment (FOO=bar)
|
||||
#
|
||||
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
|
||||
# unquoted since they have no spaces and work fine.
|
||||
#
|
||||
# Already-quoted expressions (containing literal single quotes) are passed
|
||||
# through untouched to avoid double-quoting values injected by
|
||||
# apply_rocm_test_overrides.
|
||||
#
|
||||
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
|
||||
# double-quotes stripped by the calling shell (see header comment).
|
||||
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
|
||||
###############################################################################
|
||||
|
||||
re_quote_pytest_markers() {
|
||||
local cmds="$1"
|
||||
# Pattern: -m not <identifier> -> -m 'not <identifier>'
|
||||
# Handles the common cases: 'not cpu_test', 'not slow_test', etc.
|
||||
cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
|
||||
echo "$cmds"
|
||||
local input="$1"
|
||||
local output=""
|
||||
local collecting=false
|
||||
local marker_buf=""
|
||||
|
||||
# Strip backslash-newline continuations, then flatten remaining newlines
|
||||
local flat="${input//$'\\\n'/ }"
|
||||
flat="${flat//$'\n'/ }"
|
||||
|
||||
# Disable globbing to prevent *.py etc. from expanding during read -ra
|
||||
local restore_glob
|
||||
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
|
||||
set -o noglob
|
||||
local -a words
|
||||
read -ra words <<< "$flat"
|
||||
eval "$restore_glob"
|
||||
|
||||
for word in "${words[@]}"; do
|
||||
if $collecting; then
|
||||
# If the token we're about to collect already contains a literal
|
||||
# single quote, the expression was already quoted upstream.
|
||||
# Flush and stop collecting.
|
||||
if [[ "$word" == *"'"* ]]; then
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
# Should not normally happen (partial buf + quote), flush raw
|
||||
output+="${marker_buf} "
|
||||
marker_buf=""
|
||||
fi
|
||||
output+="${word} "
|
||||
collecting=false
|
||||
continue
|
||||
fi
|
||||
|
||||
local is_boundary=false
|
||||
case "$word" in
|
||||
# Line-continuation artifact
|
||||
"\\")
|
||||
is_boundary=true ;;
|
||||
# Command separators
|
||||
"&&"|"||"|";"|"|")
|
||||
is_boundary=true ;;
|
||||
# Long flags (--ignore, --shard-id, etc.)
|
||||
--*)
|
||||
is_boundary=true ;;
|
||||
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
|
||||
# like "not" which don't start with "-". Also skip -k/-m which
|
||||
# would start a new marker (handled below).
|
||||
-[a-zA-Z])
|
||||
is_boundary=true ;;
|
||||
# Test path (contains /)
|
||||
*/*)
|
||||
is_boundary=true ;;
|
||||
# Test file (ends with .py, possibly with ::method)
|
||||
*.py|*.py::*)
|
||||
is_boundary=true ;;
|
||||
# Environment variable assignment preceding a command (FOO=bar)
|
||||
*=*)
|
||||
# Only treat as boundary if it looks like VAR=value, not
|
||||
# pytest filter expressions like num_gpus=2 inside markers
|
||||
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
|
||||
is_boundary=true
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
if $is_boundary; then
|
||||
# Flush the collected marker expression
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}' "
|
||||
else
|
||||
output+="${marker_buf} "
|
||||
fi
|
||||
collecting=false
|
||||
marker_buf=""
|
||||
# Check if this boundary word itself starts a new -m/-k
|
||||
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
# Drop stray backslash tokens silently
|
||||
elif [[ "$word" == "\\" ]]; then
|
||||
:
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
else
|
||||
# Accumulate into marker buffer
|
||||
if [[ -n "$marker_buf" ]]; then
|
||||
marker_buf+=" ${word}"
|
||||
else
|
||||
marker_buf="${word}"
|
||||
fi
|
||||
fi
|
||||
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
|
||||
output+="${word} "
|
||||
collecting=true
|
||||
marker_buf=""
|
||||
else
|
||||
output+="${word} "
|
||||
fi
|
||||
done
|
||||
|
||||
# Flush any trailing marker expression (marker at end of command)
|
||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}'"
|
||||
else
|
||||
output+="${marker_buf}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "${output% }"
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
@@ -231,11 +382,35 @@ HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands="$*"
|
||||
# ---- Command source selection ----
|
||||
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
|
||||
# Fall back to $* for backward compatibility, but warn that inner
|
||||
# double-quotes will have been stripped by the calling shell.
|
||||
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
|
||||
commands="${VLLM_TEST_COMMANDS}"
|
||||
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
|
||||
else
|
||||
commands="$*"
|
||||
if [[ -z "$commands" ]]; then
|
||||
echo "Error: No test commands provided." >&2
|
||||
echo "Usage:" >&2
|
||||
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
|
||||
echo " Legacy: bash $0 \"commands here\"" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Commands sourced from positional args (legacy mode)"
|
||||
echo "WARNING: Inner double-quotes in the command string may have been"
|
||||
echo " stripped by the calling shell. If you see syntax errors, switch to:"
|
||||
echo " export VLLM_TEST_COMMANDS='your commands here'"
|
||||
echo " bash $0"
|
||||
fi
|
||||
|
||||
echo "Raw commands: $commands"
|
||||
|
||||
# Fix quoting before ROCm overrides (so overrides see correct structure)
|
||||
commands=$(re_quote_pytest_markers "$commands")
|
||||
echo "After re-quoting: $commands"
|
||||
|
||||
commands=$(apply_rocm_test_overrides "$commands")
|
||||
echo "Final commands: $commands"
|
||||
|
||||
@@ -248,6 +423,18 @@ if [[ -z "$render_gid" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# --- RDMA device passthrough (conditional) ---
|
||||
# If the host has RDMA devices, pass them through so tests like
|
||||
# test_moriio_connector can access ibverbs. On hosts without RDMA
|
||||
# hardware the tests will gracefully skip via _rdma_available().
|
||||
RDMA_FLAGS=""
|
||||
if [ -d /dev/infiniband ]; then
|
||||
echo "RDMA devices detected on host, enabling passthrough"
|
||||
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
|
||||
else
|
||||
echo "No RDMA devices found on host, RDMA tests will be skipped"
|
||||
fi
|
||||
|
||||
# --- Route: multi-node vs single-node ---
|
||||
if is_multi_node "$commands"; then
|
||||
echo "--- Multi-node job detected"
|
||||
@@ -282,7 +469,9 @@ if is_multi_node "$commands"; then
|
||||
done
|
||||
|
||||
/bin/bash -c "${composite_command}"
|
||||
exit_code=$?
|
||||
cleanup_network
|
||||
handle_pytest_exit "$exit_code"
|
||||
else
|
||||
echo "Multi-node job detected but failed to parse bracket command syntax."
|
||||
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
|
||||
@@ -295,6 +484,7 @@ else
|
||||
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
|
||||
docker run \
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
$RDMA_FLAGS \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
@@ -308,4 +498,7 @@ else
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
|
||||
exit_code=$?
|
||||
handle_pytest_exit "$exit_code"
|
||||
fi
|
||||
|
||||
@@ -1,26 +1,43 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
export VLLM_CPU_CI_ENV=0
|
||||
|
||||
echo "--- PP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--result-dir ./test_results \
|
||||
--result-filename tp_pp.json \
|
||||
--save-result \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
|
||||
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
|
||||
if [ "$failed_req" -ne 0 ]; then
|
||||
echo "Some requests were failed!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "--- DP+TP"
|
||||
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--result-dir ./test_results \
|
||||
--result-filename dp_pp.json \
|
||||
--save-result \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &
|
||||
kill -s SIGTERM $server_pid; wait $server_pid || true
|
||||
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
|
||||
if [ "$failed_req" -ne 0 ]; then
|
||||
echo "Some requests were failed!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -34,7 +34,7 @@ function cpu_tests() {
|
||||
# offline inference
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
|
||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
|
||||
@@ -25,5 +25,5 @@ remove_docker_container
|
||||
|
||||
# Run the image and test offline inference
|
||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
||||
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||
'
|
||||
|
||||
@@ -1,9 +1,27 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# This script builds the HPU docker image and runs the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
#
|
||||
# vllm-gaudi compatibility pinning:
|
||||
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
|
||||
# When upstream vllm changes its API, the plugin may break before it has been updated.
|
||||
# To handle this, the vllm-gaudi repository maintains a file:
|
||||
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
|
||||
# The first line of that file controls what version of vllm is used inside the Docker image:
|
||||
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
|
||||
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
|
||||
# the test to a known-compatible baseline.
|
||||
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
|
||||
set -exuo pipefail
|
||||
|
||||
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
|
||||
VLLM_COMMUNITY_COMMIT=$(curl -s \
|
||||
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
|
||||
| head -1 | tr -d '\n')
|
||||
|
||||
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
|
||||
|
||||
# Try building the docker image
|
||||
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
|
||||
@@ -12,6 +30,13 @@ FROM gaudi-base-image:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
|
||||
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
|
||||
# the current checkout (the Buildkite CI commit) is used unchanged.
|
||||
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
|
||||
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
ENV no_proxy=localhost,127.0.0.1
|
||||
@@ -51,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
||||
-e PT_HPU_LAZY_MODE=1 \
|
||||
"${image_name}" \
|
||||
/bin/bash -c '
|
||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
'
|
||||
|
||||
EXITCODE=$?
|
||||
|
||||
@@ -34,17 +34,17 @@ docker run \
|
||||
set -e
|
||||
echo $ZE_AFFINITY_MASK
|
||||
pip install tblib==3.1.0
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
cd tests
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
|
||||
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
|
||||
@@ -72,7 +72,7 @@ obj_json="objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# call script to generate indicies for all existing wheels
|
||||
# call script to generate indices for all existing wheels
|
||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||
|
||||
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
||||
# generate source distribution using setup.py
|
||||
python setup.py sdist --dist-dir=$DIST_DIR
|
||||
ls -la $DIST_DIR
|
||||
|
||||
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||
echo "Found sdist: $SDIST_FILE"
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
||||
echo "Wheels uploaded to PyPI"
|
||||
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||
echo "Wheels and source distribution uploaded to PyPI"
|
||||
|
||||
@@ -156,8 +156,9 @@ steps:
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 1) # 100min
|
||||
timeout_in_minutes: 130
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@@ -173,8 +174,9 @@ steps:
|
||||
|
||||
- label: Entrypoints Integration Test (API Server 2)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@@ -192,8 +194,9 @@ steps:
|
||||
|
||||
- label: Entrypoints Integration Test (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@@ -207,8 +210,9 @@ steps:
|
||||
|
||||
- label: Entrypoints Integration Test (Responses API)
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@@ -222,8 +226,9 @@ steps:
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
@@ -278,14 +283,16 @@ steps:
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
- label: Distributed Tests (8 GPUs) # 4min
|
||||
timeout_in_minutes: 10
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
num_gpus: 8
|
||||
@@ -380,10 +387,9 @@ steps:
|
||||
|
||||
- label: V1 Test e2e + engine # 65min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
|
||||
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
|
||||
agent_pool: mi325_8
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -394,6 +400,34 @@ steps:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: V1 Test e2e (2 GPUs) # 65min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
|
||||
- label: V1 Test e2e (4 GPUs) # 65min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
|
||||
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
|
||||
agent_pool: mi325_4
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
@@ -407,8 +441,9 @@ steps:
|
||||
|
||||
- label: V1 Test others # 42min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -432,11 +467,12 @@ steps:
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
||||
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||
|
||||
- label: V1 Test attention (H100) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
@@ -463,17 +499,6 @@ steps:
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: V1 Test attention (B200) # 10min
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
agent_pool: mi325_1
|
||||
@@ -504,12 +529,12 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
@@ -540,8 +565,9 @@ steps:
|
||||
|
||||
- label: Samplers Test # 56min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
@@ -553,8 +579,9 @@ steps:
|
||||
|
||||
- label: LoRA Test %N # 20min each
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
@@ -572,6 +599,8 @@ steps:
|
||||
--ignore=lora/test_qwen3moe_tp.py
|
||||
parallelism: 4
|
||||
|
||||
##### .buildkite/test_areas/pytorch.yaml #####
|
||||
# corresponds to .buildkite/test_areas/pytorch.yaml
|
||||
- label: PyTorch Compilation Unit Tests # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@@ -589,6 +618,20 @@ steps:
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
# corresponds to .buildkite/test_areas/pytorch.yaml
|
||||
- label: PyTorch Compilation Passes Unit Tests
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile/passes
|
||||
commands:
|
||||
# TODO: clean up this comment if not needed. It is used to
|
||||
# keep track of the tests changes during vLLM IR Ops refactoring.
|
||||
# Use `find` to launch multiple instances of pytest.
|
||||
- "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@@ -664,8 +707,9 @@ steps:
|
||||
|
||||
- label: Kernels Quantization Test %N # 64min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
@@ -798,8 +842,9 @@ steps:
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@@ -860,8 +905,9 @@ steps:
|
||||
|
||||
- label: Basic Models Tests (Other)
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@@ -902,8 +948,9 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Extra Standard) %N
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@@ -923,8 +970,9 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Hybrid) %N
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@@ -944,7 +992,7 @@ steps:
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -960,7 +1008,7 @@ steps:
|
||||
|
||||
- label: Language Models Test (PPL)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -972,7 +1020,7 @@ steps:
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -984,7 +1032,7 @@ steps:
|
||||
|
||||
- label: Language Models Test (MTEB)
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -996,11 +1044,12 @@ steps:
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/registry.py
|
||||
no_gpu: true
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
@@ -1008,19 +1057,20 @@ steps:
|
||||
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 60min
|
||||
timeout_in_minutes: 100
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -1053,7 +1103,7 @@ steps:
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -1068,7 +1118,7 @@ steps:
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2 #60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -1083,7 +1133,7 @@ steps:
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3 # 75min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -1108,7 +1158,7 @@ steps:
|
||||
- pytest -v -s models/quantization
|
||||
|
||||
- label: Transformers Nightly Models Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1119,53 +1169,11 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- csrc/attention/mla/
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
||||
- vllm/v1/attention/selector.py
|
||||
- vllm/platforms/cuda.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1232,16 +1240,6 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@@ -1263,8 +1261,9 @@ steps:
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdmultinode]
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdmultinode]
|
||||
agent_pool: mi325_4
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@@ -1290,8 +1289,9 @@ steps:
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@@ -1311,6 +1311,7 @@ steps:
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
commands:
|
||||
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
|
||||
# TODO: Remove when the bug is fixed in a future ROCm release
|
||||
@@ -1324,14 +1325,14 @@ steps:
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- pytest -v -s compile/correctness_e2e/test_sequence_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Model Tests (2 GPUs) # 37min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@@ -1370,6 +1371,10 @@ steps:
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# test bge_m3_sparse io_processor plugin
|
||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
|
||||
- pip uninstall bge_m3_sparse_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
@@ -1441,7 +1446,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -1481,11 +1486,25 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
|
||||
- label: Distributed Tests (A100) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
@@ -1508,7 +1527,7 @@ steps:
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
@@ -1520,11 +1539,11 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
##### FP8 test #####
|
||||
- label: LM Eval Large Models (H100) # optional, still use H100 for consistency
|
||||
gpu: h100
|
||||
optional: true
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
num_gpus: 4
|
||||
@@ -1533,13 +1552,13 @@ steps:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
- export VLLM_USE_DEEP_GEMM=0
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4
|
||||
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
gpu: h200
|
||||
@@ -1549,16 +1568,16 @@ steps:
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# TODO: this test is not supported on ROCm, there are aiter kernels for this.
|
||||
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
# this test is not supported on ROCm
|
||||
# - pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
- label: Distributed Tests (B200) # optional
|
||||
@@ -1599,8 +1618,9 @@ steps:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_8
|
||||
optional: true
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
@@ -1619,8 +1639,8 @@ steps:
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt
|
||||
|
||||
##### EPLB Accuracy Tests #####
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
@@ -1647,19 +1667,9 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
optional: true
|
||||
@@ -1668,6 +1678,93 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
##### .buildkite/test_areas/compile.yaml #####
|
||||
# Slowly setting up the tests so that it is also easier for the
|
||||
# CI team to review and upstream to the pipelinev2.
|
||||
# The following tests are important for vLLM IR Ops refactoring,
|
||||
# which affects fusion passes on ROCm. So we have to
|
||||
# enable them as as soon as possible.
|
||||
|
||||
## TODO: Enable the test in this group
|
||||
# # corresponds to .buildkite/test_areas/compile.yaml
|
||||
# - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
|
||||
# timeout_in_minutes: 20
|
||||
# working_dir: "/vllm-workspace/"
|
||||
# mirror_hardwares: [amdexperimental, amdproduction, tj]
|
||||
# agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs
|
||||
# source_file_dependencies:
|
||||
# - csrc/quantization/fp4/
|
||||
# - vllm/model_executor/layers/quantization/
|
||||
# - vllm/model_executor/layers/layernorm.py
|
||||
# - vllm/model_executor/layers/activation.py
|
||||
# - vllm/model_executor/layers/attention/attention.py
|
||||
# - vllm/v1/attention/backends/flashinfer.py
|
||||
# - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
|
||||
# - tests/compile/test_fusion_attn.py
|
||||
# - tests/compile/test_silu_mul_quant_fusion.py
|
||||
# - tests/compile/distributed/test_fusion_all_reduce.py
|
||||
# - tests/compile/fullgraph/test_full_graph.py
|
||||
# commands:
|
||||
# - rocm-smi
|
||||
# # we run all backend tests on ROCm
|
||||
# # These two tests are covered in "PyTorch Compilation Passes Unit Tests"
|
||||
# # - "pytest -v -s tests/compile/passes/test_fusion_attn.py"
|
||||
# # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
|
||||
# # TODO: this test is not supported on ROCm, there are aiter kernels for this.
|
||||
# # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||
# # TODO: find out more details
|
||||
# # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
# corresponds to .buildkite/test_areas/compile.yaml
|
||||
- label: Fusion E2E Quick (MI325)
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/"
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/
|
||||
- vllm/v1/attention/
|
||||
- vllm/compilation/
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- rocm-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
|
||||
# Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER
|
||||
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'"
|
||||
|
||||
# corresponds to .buildkite/test_areas/compile.yaml
|
||||
- label: Fusion E2E Config Sweep (MI325)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
num_devices: 1
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/attention/attention.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/fusions_e2e/
|
||||
commands:
|
||||
- rocm-smi
|
||||
# Run just llama3 (fp8) for all config combinations
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
|
||||
|
||||
## There are no ops on ROCm for these tests.
|
||||
## The test still passes but the logs are not useful.
|
||||
## fused ops just call torch.ops.symm_mem which
|
||||
## exists in ROCm even though they don't work
|
||||
# - label: AsyncTP Correctness Tests (2xMI325 GPUs)
|
||||
# - label: Fusion E2E TP2 Quick (MI325)
|
||||
# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
|
||||
# - label: Fusion E2E TP2 (MI325)
|
||||
# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs)
|
||||
|
||||
|
||||
#####################################################################################################################################
|
||||
@@ -1850,8 +1947,10 @@ steps:
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi355_4
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@@ -1905,7 +2004,8 @@ steps:
|
||||
- popd
|
||||
# NEW rlhf examples
|
||||
- pushd ../examples/offline_inference/new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
|
||||
- popd
|
||||
|
||||
@@ -2050,20 +2150,7 @@ steps:
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
||||
|
||||
- label: V1 Test attention (H100) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi355_1
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||
|
||||
- label: Batch Invariance Tests (H100) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@@ -2081,6 +2168,8 @@ steps:
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: V1 Test attention (B200) # 10min
|
||||
mirror_hardwares: [amdexperimental, amdmi355]
|
||||
agent_pool: mi355_1
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
@@ -2119,12 +2208,12 @@ steps:
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
# for basic
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
@@ -2700,12 +2789,14 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
- label: Blackwell Test (MI355) # 21 min
|
||||
mirror_hardwares: [amdexperimental, amdmi355]
|
||||
agent_pool: mi355_1
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -2724,28 +2815,28 @@ steps:
|
||||
- vllm/v1/attention/selector.py
|
||||
- vllm/platforms/cuda.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- rocm-smi
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||
## Quantization
|
||||
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -2815,13 +2906,15 @@ steps:
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||
agent_pool: mi355_2
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
@@ -2869,8 +2962,10 @@ steps:
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi355_2
|
||||
optional: true
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@@ -2946,6 +3041,10 @@ steps:
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# test bge_m3_sparse io_processor plugin
|
||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
|
||||
- pip uninstall bge_m3_sparse_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
@@ -3051,6 +3150,20 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi355_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
|
||||
@@ -3183,8 +3296,8 @@ steps:
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt
|
||||
|
||||
##### EPLB Accuracy Tests #####
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
@@ -3198,18 +3311,9 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi355_4
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355)
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||
agent_pool: mi355_2
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
@@ -3227,4 +3331,19 @@ steps:
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
|
||||
- label: Attention Benchmarks Smoke Test (B200-MI355)
|
||||
device: b200
|
||||
mirror_hardwares: [amdexperimental, amdmi355]
|
||||
agent_pool: mi355_2
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- benchmarks/attention_benchmarks/
|
||||
- vllm/v1/attention/
|
||||
commands:
|
||||
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||
|
||||
|
||||
@@ -36,6 +36,16 @@ steps:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: AsyncTP Correctness Tests (B200)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||
|
||||
- label: Distributed Compile Unit Tests (2xH100)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
|
||||
@@ -67,6 +67,7 @@ steps:
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
- tests/distributed/test_multiproc_executor.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
@@ -95,6 +96,8 @@ steps:
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# test multi-node TP with multiproc executor (simulated on single node)
|
||||
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
@@ -103,7 +106,8 @@ steps:
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
@@ -145,7 +149,7 @@ steps:
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
|
||||
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
@@ -209,6 +213,19 @@ steps:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
device: a100
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||
- vllm/v1/worker/kv_connector_model_runner_mixin.py
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
|
||||
|
||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
|
||||
@@ -14,7 +14,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine
|
||||
- label: V1 e2e + engine (1 GPU)
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -36,3 +36,35 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: V1 e2e (2 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 e2e (4 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
optional: true
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_4
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
@@ -41,6 +41,11 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (API Server 2)
|
||||
timeout_in_minutes: 130
|
||||
@@ -55,6 +60,11 @@ steps:
|
||||
- pytest -v -s entrypoints/instrumentator
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
@@ -87,6 +97,11 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
|
||||
@@ -20,4 +20,19 @@ steps:
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
- pytest -v -s distributed/test_eplb_spec_decode.py
|
||||
|
||||
- label: Elastic EP Scaling Test
|
||||
timeout_in_minutes: 20
|
||||
device: b200
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/compilation/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- pytest -v -s distributed/test_elastic_ep.py
|
||||
|
||||
@@ -8,8 +8,9 @@ steps:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
- tests/kernels/test_top_k_per_row.py
|
||||
- tests/kernels/test_concat_mla_q.py
|
||||
commands:
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
timeout_in_minutes: 35
|
||||
@@ -44,7 +45,8 @@ steps:
|
||||
- vllm/envs.py
|
||||
- vllm/config
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
@@ -70,7 +72,7 @@ steps:
|
||||
- tests/kernels/moe/test_batched_deepgemm.py
|
||||
- tests/kernels/attention/test_deepgemm_attention.py
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
|
||||
- pytest -v -s kernels/quantization/test_block_fp8.py
|
||||
- pytest -v -s kernels/moe/test_deepgemm.py
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
@@ -95,7 +97,7 @@ steps:
|
||||
- vllm/platforms/cuda.py
|
||||
commands:
|
||||
- nvidia-smi
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
# Attention
|
||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||
@@ -155,5 +157,14 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
device: b200
|
||||
num_devices: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
@@ -11,17 +11,17 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(A100)
|
||||
device: a100
|
||||
optional: true
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
# - label: LM Eval Large Models (4 GPUs)(A100)
|
||||
# device: a100
|
||||
# optional: true
|
||||
# num_devices: 4
|
||||
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
# source_file_dependencies:
|
||||
# - csrc/
|
||||
# - vllm/model_executor/layers/quantization
|
||||
# commands:
|
||||
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (4 GPUs)(H100)
|
||||
device: h100
|
||||
|
||||
@@ -9,6 +9,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s -m 'not cpu_test' v1/core
|
||||
- pytest -v -s v1/executor
|
||||
@@ -66,12 +67,13 @@ steps:
|
||||
- examples/
|
||||
commands:
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 offline_inference/basic/chat.py # for basic
|
||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
# for basic
|
||||
- python3 basic/offline_inference/chat.py
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||
- python3 basic/offline_inference/classify.py
|
||||
- python3 basic/offline_inference/embed.py
|
||||
- python3 basic/offline_inference/score.py
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
@@ -86,6 +88,11 @@ steps:
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
|
||||
@@ -65,7 +65,7 @@ steps:
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/basic/offline_inference/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
@@ -12,6 +12,11 @@ steps:
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
depends_on:
|
||||
@@ -20,6 +25,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/registry.py
|
||||
device: cpu
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
@@ -30,6 +36,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/registry.py
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
@@ -52,6 +59,11 @@ steps:
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
optional: true
|
||||
@@ -70,12 +82,3 @@ steps:
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
# PR authors can temporarily add commands below to test individual models
|
||||
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
|
||||
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
||||
|
||||
@@ -15,10 +15,17 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
# begin io_processor plugins test
|
||||
# test generic io_processor plugins functions
|
||||
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
|
||||
# test Terratorch io_processor plugins
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# test bge_m3_sparse io_processor plugin
|
||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
|
||||
- pip uninstall bge_m3_sparse_plugin -y
|
||||
# end io_processor plugins test
|
||||
# begin stat_logger plugins test
|
||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||
@@ -32,3 +39,8 @@ steps:
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
16
.buildkite/test_areas/ray_compat.yaml
Normal file
16
.buildkite/test_areas/ray_compat.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
group: Ray Compatibility
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Ray Dependency Compatibility Check
|
||||
# Informational only — does not block the pipeline.
|
||||
# If this fails, it means the PR introduces a dependency that
|
||||
# conflicts with Ray's dependency constraints.
|
||||
# See https://github.com/vllm-project/vllm/issues/33599
|
||||
soft_fail: true
|
||||
timeout_in_minutes: 10
|
||||
source_file_dependencies:
|
||||
- requirements/
|
||||
- setup.py
|
||||
commands:
|
||||
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
|
||||
@@ -13,13 +13,13 @@ steps:
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU - Large Models # optional
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
device: a100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
commands:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
# - label: Weight Loading Multiple GPU - Large Models # optional
|
||||
# working_dir: "/vllm-workspace/tests"
|
||||
# num_devices: 2
|
||||
# device: a100
|
||||
# optional: true
|
||||
# source_file_dependencies:
|
||||
# - vllm/
|
||||
# - tests/weight_loading
|
||||
# commands:
|
||||
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||
|
||||
24
.github/.bc-linter.yml
vendored
24
.github/.bc-linter.yml
vendored
@@ -1,24 +0,0 @@
|
||||
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
|
||||
version: 1
|
||||
paths:
|
||||
# We temporarily disable globally, and will only enable with `annotations.include`
|
||||
# include:
|
||||
# - "vllm/v1/attetion/*.py"
|
||||
# - "vllm/v1/core/*.py"
|
||||
exclude:
|
||||
- "**/*.py"
|
||||
|
||||
scan:
|
||||
functions: true # check free functions and methods
|
||||
classes: true # check classes/dataclasses
|
||||
public_only: true # ignore names starting with "_" at any level
|
||||
|
||||
annotations:
|
||||
include: # decorators that force‑include a symbol
|
||||
- name: "bc_linter_include" # matched by simple name or dotted suffix
|
||||
propagate_to_members: false # for classes, include methods/inner classes
|
||||
exclude: # decorators that force‑exclude a symbol
|
||||
- name: "bc_linter_skip" # matched by simple name or dotted suffix
|
||||
propagate_to_members: true # for classes, exclude methods/inner classes
|
||||
|
||||
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
|
||||
9
.github/CODEOWNERS
vendored
9
.github/CODEOWNERS
vendored
@@ -2,7 +2,7 @@
|
||||
# for more info about CODEOWNERS file
|
||||
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
|
||||
@@ -54,11 +54,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/kv_offload @ApostaC @orozery
|
||||
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||
/vllm/v1/engine @njhill
|
||||
/vllm/v1/executor @njhill
|
||||
/vllm/v1/worker @njhill
|
||||
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
/vllm/v1/worker/gpu @WoosukKwon @njhill
|
||||
/vllm/v1/worker/gpu/kv_connector.py @orozery
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin
|
||||
|
||||
10
.github/mergify.yml
vendored
10
.github/mergify.yml
vendored
@@ -3,6 +3,7 @@ pull_request_rules:
|
||||
description: Automatically apply documentation label
|
||||
conditions:
|
||||
- label != stale
|
||||
- -closed
|
||||
- or:
|
||||
- files~=^[^/]+\.md$
|
||||
- files~=^docs/
|
||||
@@ -37,15 +38,13 @@ pull_request_rules:
|
||||
|
||||
> [!TIP]
|
||||
> <details>
|
||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
||||
> <summary>Is <code>mypy</code> failing?</summary>
|
||||
> <br/>
|
||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
||||
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
|
||||
>
|
||||
> ```bash
|
||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||
> pre-commit run --hook-stage manual mypy-3.10
|
||||
> # For markdownlint
|
||||
> pre-commit run --hook-stage manual markdownlint
|
||||
> ```
|
||||
> </details>
|
||||
|
||||
@@ -259,8 +258,7 @@ pull_request_rules:
|
||||
- files=benchmarks/run_structured_output_benchmark.sh
|
||||
- files=docs/features/structured_outputs.md
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files=examples/online_serving/structured_outputs/structured_outputs.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
|
||||
29
.github/workflows/bc-lint.yml
vendored
29
.github/workflows/bc-lint.yml
vendored
@@ -1,29 +0,0 @@
|
||||
name: BC Lint
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
- labeled
|
||||
- unlabeled
|
||||
|
||||
jobs:
|
||||
bc_lint:
|
||||
if: github.repository_owner == 'vllm-project'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Run BC Lint Action
|
||||
uses: pytorch/test-infra/.github/actions/bc-lint@main
|
||||
with:
|
||||
repo: ${{ github.event.pull_request.head.repo.full_name }}
|
||||
base_sha: ${{ github.event.pull_request.base.sha }}
|
||||
head_sha: ${{ github.event.pull_request.head.sha }}
|
||||
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
|
||||
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
|
||||
config_dir: .github
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
|
||||
cancel-in-progress: true
|
||||
3
.github/workflows/macos-smoke-test.yml
vendored
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -6,6 +6,9 @@ on:
|
||||
- main
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
macos-m1-smoke-test:
|
||||
runs-on: macos-latest
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -3,6 +3,8 @@
|
||||
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/*
|
||||
!vllm/vllm_flash_attn/__init__.py
|
||||
!vllm/vllm_flash_attn/flash_attn_interface.py
|
||||
|
||||
# OpenAI triton kernels copied from source
|
||||
vllm/third_party/triton_kernels/*
|
||||
|
||||
@@ -13,7 +13,7 @@ repos:
|
||||
args: [--output-format, github, --fix]
|
||||
- id: ruff-format
|
||||
- repo: https://github.com/crate-ci/typos
|
||||
rev: v1.38.1
|
||||
rev: v1.43.5
|
||||
hooks:
|
||||
- id: typos
|
||||
args: [--force-exclude]
|
||||
@@ -24,12 +24,12 @@ repos:
|
||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||
types_or: [c++, cuda]
|
||||
args: [--style=file, --verbose]
|
||||
- repo: https://github.com/igorshubovych/markdownlint-cli
|
||||
rev: v0.45.0
|
||||
- repo: https://github.com/DavidAnson/markdownlint-cli2
|
||||
rev: v0.21.0
|
||||
hooks:
|
||||
- id: markdownlint
|
||||
exclude: '.*\.inc\.md'
|
||||
stages: [manual] # Only run in CI
|
||||
- id: markdownlint-cli2
|
||||
language_version: lts
|
||||
args: [--fix]
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
hooks:
|
||||
@@ -55,7 +55,7 @@ repos:
|
||||
language: python
|
||||
types_or: [python, pyi]
|
||||
require_serial: true
|
||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.10
|
||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
||||
@@ -127,6 +127,13 @@ repos:
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: [regex]
|
||||
# prevent use torch.cuda APIs
|
||||
- id: check-torch-cuda-call
|
||||
name: "Prevent new 'torch.cuda' APIs call"
|
||||
entry: python tools/pre_commit/check_torch_cuda.py
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: [regex]
|
||||
- id: validate-config
|
||||
name: Validate configuration has default values and that each field has a docstring
|
||||
entry: python tools/pre_commit/validate_config.py
|
||||
|
||||
@@ -9,6 +9,7 @@ build:
|
||||
python: "3.12"
|
||||
jobs:
|
||||
post_checkout:
|
||||
- bash docs/maybe_skip_pr_build.sh
|
||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||
pre_create_environment:
|
||||
- pip install uv
|
||||
|
||||
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# CUTLASS MoE kernels
|
||||
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
|
||||
# if it's possible to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
@@ -771,6 +771,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
else()
|
||||
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
|
||||
endif()
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
|
||||
"csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
|
||||
message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
|
||||
else()
|
||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
|
||||
AND ES_MXFP8_GROUPED_MM_ARCHS)
|
||||
message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
|
||||
"not >= 12.8.")
|
||||
else()
|
||||
message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
|
||||
"in CUDA target architectures.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
||||
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
|
||||
@@ -971,7 +998,8 @@ set(VLLM_MOE_EXT_SRC
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/moe_wna16.cu"
|
||||
"csrc/moe/grouped_topk_kernels.cu")
|
||||
"csrc/moe/grouped_topk_kernels.cu"
|
||||
"csrc/moe/router_gemm.cu")
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
@@ -187,7 +187,7 @@ python benchmark.py \
|
||||
## Hardware Requirements
|
||||
|
||||
| Backend | Hardware |
|
||||
|---------|----------|
|
||||
| ------- | -------- |
|
||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||
| CUTLASS MLA | Blackwell (SM100+) |
|
||||
| FlashAttn MLA | Hopper (SM90+) |
|
||||
|
||||
@@ -15,7 +15,6 @@ from .common import (
|
||||
BenchmarkConfig,
|
||||
BenchmarkResult,
|
||||
MockLayer,
|
||||
MockModelConfig,
|
||||
ResultsFormatter,
|
||||
get_attention_scale,
|
||||
is_mla_backend,
|
||||
@@ -36,7 +35,6 @@ __all__ = [
|
||||
"ResultsFormatter",
|
||||
# Mock objects
|
||||
"MockLayer",
|
||||
"MockModelConfig",
|
||||
# Utilities
|
||||
"setup_mla_dims",
|
||||
"get_attention_scale",
|
||||
|
||||
@@ -10,7 +10,6 @@ from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from batch_spec import get_batch_type, parse_batch_spec
|
||||
from rich.console import Console
|
||||
@@ -31,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||
return (batch_size, max_q_len, max_kv_len)
|
||||
except Exception:
|
||||
# Fallback for unparseable specs
|
||||
# Fallback for unparsable specs
|
||||
return (0, 0, 0)
|
||||
|
||||
|
||||
@@ -62,10 +61,7 @@ class MockHfConfig:
|
||||
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||
try:
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
|
||||
_HAS_ATTENTION_LAYER_BASE = True
|
||||
except ImportError:
|
||||
_HAS_ATTENTION_LAYER_BASE = False
|
||||
AttentionLayerBase = object # Fallback
|
||||
|
||||
|
||||
@@ -167,95 +163,6 @@ class MockLayer(AttentionLayerBase):
|
||||
return self._kv_cache_spec
|
||||
|
||||
|
||||
class MockModelConfig:
|
||||
"""Mock model configuration."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype = torch.float16,
|
||||
max_model_len: int = 32768,
|
||||
):
|
||||
self._n_q = num_q_heads
|
||||
self._n_kv = num_kv_heads
|
||||
self._d = head_dim
|
||||
self.dtype = dtype
|
||||
self.max_model_len = max_model_len
|
||||
|
||||
def get_num_attention_heads(self, _=None) -> int:
|
||||
return self._n_q
|
||||
|
||||
def get_num_kv_heads(self, _=None) -> int:
|
||||
return self._n_kv
|
||||
|
||||
def get_head_size(self) -> int:
|
||||
return self._d
|
||||
|
||||
def get_num_layers(self) -> int:
|
||||
"""Mock method for layer count queries."""
|
||||
return 1
|
||||
|
||||
def get_sliding_window_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for sliding window queries."""
|
||||
return None
|
||||
|
||||
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
|
||||
"""Mock method for logits soft cap queries."""
|
||||
return None
|
||||
|
||||
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
|
||||
"""Mock method for SM scale queries."""
|
||||
return 1.0 / (self.get_head_size() ** 0.5)
|
||||
|
||||
|
||||
class MockParallelConfig:
|
||||
"""Mock parallel configuration."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class MockCompilationConfig:
|
||||
"""Mock compilation configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.full_cuda_graph = False
|
||||
self.static_forward_context = {}
|
||||
|
||||
|
||||
class MockVLLMConfig:
|
||||
"""Mock VLLM configuration."""
|
||||
|
||||
def __init__(self):
|
||||
self.compilation_config = MockCompilationConfig()
|
||||
|
||||
|
||||
class MockRunner:
|
||||
"""Mock GPU runner for metadata builders."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
seq_lens: np.ndarray,
|
||||
query_start_locs: np.ndarray,
|
||||
device: torch.device,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
|
||||
self.parallel_config = MockParallelConfig()
|
||||
self.vllm_config = MockVLLMConfig()
|
||||
self.seq_lens_np = seq_lens
|
||||
self.query_start_loc_np = query_start_locs
|
||||
self.device = device
|
||||
self.attention_chunk_size = None
|
||||
self.num_query_heads = num_q_heads
|
||||
self.num_kv_heads = num_kv_heads
|
||||
self.dtype = dtype
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParameterSweep:
|
||||
"""Configuration for sweeping a backend parameter."""
|
||||
|
||||
@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
@@ -701,7 +700,7 @@ def _run_single_benchmark(
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
forward_fn()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
@@ -714,7 +713,7 @@ def _run_single_benchmark(
|
||||
forward_fn()
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||
|
||||
|
||||
@@ -141,7 +141,6 @@ def _create_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
swap_space=0,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
@@ -391,7 +390,7 @@ def _run_single_benchmark(
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
@@ -412,7 +411,7 @@ def _run_single_benchmark(
|
||||
)
|
||||
end.record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
elapsed_ms = start.elapsed_time(end)
|
||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
|
||||
| --- | --- | --- |
|
||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
||||
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
||||
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
||||
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||
|
||||
@@ -85,7 +85,6 @@ start_server() {
|
||||
# Each argument and its value are separate elements.
|
||||
local common_args_array=(
|
||||
"$MODEL"
|
||||
"--disable-log-requests"
|
||||
"--port" "8004"
|
||||
"--host" "$HOSTNAME"
|
||||
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
||||
|
||||
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k
|
||||
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||
]
|
||||
|
||||
@@ -94,7 +94,7 @@ def create_logits(
|
||||
|
||||
def measure_memory() -> tuple[int, int]:
|
||||
"""Return (allocated, reserved) memory in bytes."""
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ def reset_memory_stats():
|
||||
"""Reset peak memory statistics."""
|
||||
reset_buffer_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.cuda.empty_cache()
|
||||
torch.accelerator.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ def benchmark_function(
|
||||
for _ in range(warmup_iters):
|
||||
logits_copy = logits.clone()
|
||||
func(logits_copy, k, p)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Reset memory stats before benchmark
|
||||
reset_memory_stats()
|
||||
@@ -140,7 +140,7 @@ def benchmark_function(
|
||||
func(logits_copy, k, p)
|
||||
end_events[i].record()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Calculate timing
|
||||
times = [
|
||||
|
||||
@@ -1,78 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
|
||||
) -> list:
|
||||
"""
|
||||
Save the benchmark results in the format used by PyTorch OSS benchmark with
|
||||
on metric per record
|
||||
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
|
||||
"""
|
||||
records = []
|
||||
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
|
||||
return records
|
||||
|
||||
for name, benchmark_values in metrics.items():
|
||||
record = {
|
||||
"benchmark": {
|
||||
"name": "vLLM benchmark",
|
||||
"extra_info": {
|
||||
"args": vars(args),
|
||||
},
|
||||
},
|
||||
"model": {
|
||||
"name": args.model,
|
||||
},
|
||||
"metric": {
|
||||
"name": name,
|
||||
"benchmark_values": benchmark_values,
|
||||
"extra_info": extra_info,
|
||||
},
|
||||
}
|
||||
|
||||
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
|
||||
# Save tensor_parallel_size parameter if it's part of the metadata
|
||||
if not tp and "tensor_parallel_size" in extra_info:
|
||||
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
|
||||
extra_info["tensor_parallel_size"]
|
||||
)
|
||||
|
||||
records.append(record)
|
||||
|
||||
return records
|
||||
|
||||
|
||||
class InfEncoder(json.JSONEncoder):
|
||||
def clear_inf(self, o: Any):
|
||||
if isinstance(o, dict):
|
||||
return {k: self.clear_inf(v) for k, v in o.items()}
|
||||
elif isinstance(o, list):
|
||||
return [self.clear_inf(v) for v in o]
|
||||
elif isinstance(o, float) and math.isinf(o):
|
||||
return "inf"
|
||||
return o
|
||||
|
||||
def iterencode(self, o: Any, *args, **kwargs) -> Any:
|
||||
return super().iterencode(self.clear_inf(o), *args, **kwargs)
|
||||
|
||||
|
||||
def write_to_json(filename: str, records: list) -> None:
|
||||
with open(filename, "w") as f:
|
||||
json.dump(
|
||||
records,
|
||||
f,
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||
)
|
||||
|
||||
|
||||
# Collect time and generate time metrics
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Cutlass bench utils
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
|
||||
@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(
|
||||
|
||||
# Compressed B, Metadata, Original A, B
|
||||
return b_compressed, e, a, b
|
||||
|
||||
|
||||
def make_n_rand_sparse_tensors(
|
||||
num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
|
||||
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
|
||||
ABs = []
|
||||
for _ in range(num_tensors):
|
||||
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
|
||||
if b_comp is not None:
|
||||
ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
|
||||
BComps, Es, As, Bs = zip(*ABs)
|
||||
return list(BComps), list(Es), list(As), list(Bs)
|
||||
|
||||
@@ -1,45 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
"""Token bucket rate limiter implementation"""
|
||||
|
||||
def __init__(self, rate_limit):
|
||||
self.rate_limit = rate_limit # Requests per second
|
||||
self.num_available_tokens = rate_limit # Available tokens
|
||||
self.last_refill = time.monotonic() # Last token refill time
|
||||
self.lock = asyncio.Lock() # Synchronization lock
|
||||
|
||||
async def acquire(self):
|
||||
"""Acquire a token from the rate limiter"""
|
||||
while True:
|
||||
async with self.lock:
|
||||
current_time = time.monotonic()
|
||||
elapsed = current_time - self.last_refill
|
||||
|
||||
# Refill num_available_tokens if more than 1 second has passed
|
||||
if elapsed > 1.0:
|
||||
self.num_available_tokens = self.rate_limit
|
||||
self.last_refill = current_time
|
||||
|
||||
# Check if num_available_tokens are available
|
||||
if self.num_available_tokens > 0:
|
||||
self.num_available_tokens -= 1
|
||||
return True
|
||||
|
||||
# Calculate wait time if no num_available_tokens available
|
||||
wait_time = 1.0 - elapsed
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Enter async context manager - acquire token"""
|
||||
await self.acquire()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_value, traceback):
|
||||
"""Exit async context manager - no cleanup needed"""
|
||||
pass
|
||||
@@ -1,39 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
from collections import deque
|
||||
|
||||
|
||||
class RequestQueue:
|
||||
"""Request queue manager with concurrency control"""
|
||||
|
||||
def __init__(self, max_concurrent, max_queue_size):
|
||||
# Maximum concurrent requests
|
||||
self.max_concurrent = max_concurrent
|
||||
self.max_queue_size = max_queue_size # Maximum queue size
|
||||
# Concurrency control
|
||||
self.semaphore = asyncio.Semaphore(max_concurrent)
|
||||
self.queue = deque() # Request queue
|
||||
self.queue_size = 0 # Current queue size
|
||||
self.lock = asyncio.Lock() # Sync queue Lock
|
||||
|
||||
async def enqueue(self, task):
|
||||
"""Add a request task to the queue"""
|
||||
async with self.lock:
|
||||
if self.queue_size >= self.max_queue_size:
|
||||
return False
|
||||
|
||||
self.queue.append(task)
|
||||
self.queue_size += 1
|
||||
return True
|
||||
|
||||
async def process(self):
|
||||
"""Process queued requests using semaphore for concurrency control"""
|
||||
while True:
|
||||
if self.queue:
|
||||
async with self.semaphore, self.lock:
|
||||
task = self.queue.popleft()
|
||||
self.queue_size -= 1
|
||||
await task
|
||||
await asyncio.sleep(0.01) # Yield control to event loop
|
||||
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
# DeepSeek V3 dimensions
|
||||
NOPE_DIM = 512
|
||||
ROPE_DIM = 64
|
||||
NUM_HEADS = 128
|
||||
|
||||
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
|
||||
|
||||
|
||||
def get_configs():
|
||||
return NUM_TOKENS
|
||||
|
||||
|
||||
def make_inputs(num_tokens, dtype):
|
||||
"""Create inputs matching the real code path.
|
||||
|
||||
Args:
|
||||
contiguous_nope: If False, simulate the transposed BMM output
|
||||
(non-contiguous nope with stride pattern from
|
||||
[N,B,L].transpose(0,1)).
|
||||
"""
|
||||
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
|
||||
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
|
||||
ql_nope = raw.transpose(0, 1)
|
||||
|
||||
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
|
||||
return ql_nope, q_pe
|
||||
|
||||
|
||||
# ---- Non-contiguous nope benchmark (real code path) ----
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens"],
|
||||
x_vals=get_configs(),
|
||||
line_arg="provider",
|
||||
line_vals=["torch_cat", "concat_mla_q"],
|
||||
line_names=["torch.cat", "concat_mla_q (v8)"],
|
||||
styles=[("blue", "--"), ("green", "-")],
|
||||
ylabel="Latency (us)",
|
||||
plot_name="concat_mla_q-transposed",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def bench_transposed(num_tokens, provider):
|
||||
dtype = torch.bfloat16
|
||||
ql_nope, q_pe = make_inputs(num_tokens, dtype)
|
||||
|
||||
q_out = torch.empty(
|
||||
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
|
||||
)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch_cat":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
|
||||
)
|
||||
|
||||
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
|
||||
parser.add_argument(
|
||||
"--save-path", type=str, default=None, help="Path to save benchmark results"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("CONCAT MLA Q KERNEL BENCHMARKS")
|
||||
print("=" * 70)
|
||||
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
|
||||
print(
|
||||
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
|
||||
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
|
||||
)
|
||||
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
|
||||
print("=" * 70)
|
||||
|
||||
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
|
||||
bench_transposed.run(print_data=True, save_path=args.save_path)
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("Benchmarking complete!")
|
||||
print("=" * 70)
|
||||
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
# DeepSeek V3 MLA dimensions
|
||||
NOPE_DIM = 512
|
||||
ROPE_DIM = 64
|
||||
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
|
||||
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
|
||||
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
|
||||
|
||||
# Realistic prefill scenarios:
|
||||
# - 1 long prefill: single request, 16K-96K tokens
|
||||
# - 4 medium prefills: 4 requests, 4K-24K tokens each
|
||||
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
|
||||
SCENARIOS = [
|
||||
# (label, num_reqs, total_tokens_list)
|
||||
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
|
||||
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
|
||||
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
|
||||
]
|
||||
|
||||
|
||||
def make_inputs(total_tokens, num_reqs, block_size):
|
||||
"""Create synthetic FP8 cache, block table, and output buffer.
|
||||
|
||||
Fills the cache with random bytes (we only measure throughput,
|
||||
not correctness). Block table maps each request to contiguous
|
||||
physical blocks.
|
||||
"""
|
||||
# Divide tokens evenly across requests
|
||||
base_len = total_tokens // num_reqs
|
||||
remainder = total_tokens % num_reqs
|
||||
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
|
||||
|
||||
# workspace_starts: cumulative sum of seq_lens
|
||||
workspace_starts = [0] * num_reqs
|
||||
for r in range(1, num_reqs):
|
||||
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
|
||||
|
||||
# Physical blocks needed per request
|
||||
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
|
||||
total_blocks = sum(blocks_per_req)
|
||||
max_blocks = max(blocks_per_req)
|
||||
|
||||
# Allocate cache with random data (content doesn't matter for perf)
|
||||
cache = torch.randint(
|
||||
0,
|
||||
256,
|
||||
(total_blocks, block_size, ENTRY_BYTES),
|
||||
dtype=torch.uint8,
|
||||
device="cuda",
|
||||
)
|
||||
|
||||
# Block table: contiguous block assignments
|
||||
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
|
||||
block_idx = 0
|
||||
for r in range(num_reqs):
|
||||
for b in range(blocks_per_req[r]):
|
||||
block_table[r, b] = block_idx
|
||||
block_idx += 1
|
||||
|
||||
# Output workspace
|
||||
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
|
||||
|
||||
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
|
||||
workspace_starts_t = torch.tensor(
|
||||
workspace_starts, dtype=torch.int32, device="cuda"
|
||||
)
|
||||
|
||||
return cache, dst, block_table, seq_lens_t, workspace_starts_t
|
||||
|
||||
|
||||
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
|
||||
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["total_tokens"],
|
||||
x_vals=total_tokens_list,
|
||||
line_arg="provider",
|
||||
line_vals=["cuda_kernel"],
|
||||
line_names=["cp_gather_fp8 (CUDA)"],
|
||||
styles=[("green", "-")],
|
||||
ylabel="Latency (us)",
|
||||
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
|
||||
args={"num_reqs": num_reqs},
|
||||
)
|
||||
)
|
||||
def bench_fn(total_tokens, provider, num_reqs):
|
||||
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
|
||||
total_tokens, num_reqs, BLOCK_SIZE
|
||||
)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
|
||||
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
|
||||
),
|
||||
quantiles=quantiles,
|
||||
rep=500,
|
||||
)
|
||||
|
||||
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||
|
||||
seq_len_per_req = total_tokens_list[0] // num_reqs
|
||||
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
|
||||
print(
|
||||
f"\n--- {label}: {num_reqs} request(s), "
|
||||
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
|
||||
)
|
||||
bench_fn.run(print_data=True, save_path=save_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to save benchmark results as CSV",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Print data volume info for bandwidth analysis
|
||||
read_per_token = ENTRY_BYTES # 656 bytes from cache
|
||||
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
|
||||
total_per_token = read_per_token + write_per_token # 1808 bytes
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
|
||||
print("=" * 70)
|
||||
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
|
||||
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
|
||||
print(f"Per token: {total_per_token} bytes (read + write)")
|
||||
print(f"Block size: {BLOCK_SIZE} tokens/block")
|
||||
print("=" * 70)
|
||||
|
||||
for label, num_reqs, total_tokens_list in SCENARIOS:
|
||||
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("Benchmarking complete!")
|
||||
print("=" * 70)
|
||||
@@ -168,7 +168,7 @@ def bench_impl(
|
||||
# warmup
|
||||
for kwargs in kwargs_list:
|
||||
impl_type.get_impl()(**kwargs)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
|
||||
# reference output
|
||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||
|
||||
# test ouptut
|
||||
# test output
|
||||
out_q, out_s = output_from_impl(
|
||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||
)
|
||||
|
||||
@@ -12,12 +12,12 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||
from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
||||
maybe_make_prepare_finalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
||||
MoEPrepareAndFinalizeNoEP,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
@@ -137,15 +137,21 @@ def bench_run(
|
||||
per_out_ch_quant=per_out_ch,
|
||||
)
|
||||
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
)
|
||||
fn = mk.FusedMoEKernel(
|
||||
maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
CutlassExpertsFp8(
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -165,7 +171,7 @@ def bench_run(
|
||||
activation=MoEActivation.SILU,
|
||||
global_num_experts=num_experts,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
||||
triton_stream = torch.cuda.Stream()
|
||||
@@ -181,14 +187,14 @@ def bench_run(
|
||||
topk_ids,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
||||
# Warmup
|
||||
for _ in range(num_warmup):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Timing
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
@@ -196,7 +202,7 @@ def bench_run(
|
||||
|
||||
latencies = []
|
||||
for _ in range(num_iters):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
end_event.record()
|
||||
|
||||
@@ -15,6 +15,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
||||
maybe_make_prepare_finalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
fp8_w8a8_moe_quant_config,
|
||||
nvfp4_moe_quant_config,
|
||||
@@ -23,9 +26,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
||||
CutlassExpertsFp4,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
||||
MoEPrepareAndFinalizeNoEP,
|
||||
)
|
||||
from vllm.scalar_type import scalar_types
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
@@ -196,10 +196,21 @@ def bench_run(
|
||||
g2_alphas=w2_gs,
|
||||
)
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
)
|
||||
kernel = mk.FusedMoEKernel(
|
||||
maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
CutlassExpertsFp4(
|
||||
make_dummy_moe_config(),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -240,11 +251,17 @@ def bench_run(
|
||||
g1_alphas=w1_gs,
|
||||
g2_alphas=w2_gs,
|
||||
)
|
||||
moe_config = make_dummy_moe_config()
|
||||
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
kernel = mk.FusedMoEKernel(
|
||||
maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
CutlassExpertsFp4(
|
||||
make_dummy_moe_config(),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -290,7 +307,7 @@ def bench_run(
|
||||
def replay_graph(graph, num_repeats):
|
||||
for _ in range(num_repeats):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
cutlass_stream = torch.cuda.Stream()
|
||||
cutlass_graph = torch.cuda.CUDAGraph()
|
||||
@@ -313,7 +330,7 @@ def bench_run(
|
||||
e=num_experts,
|
||||
device=device,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
triton_stream = torch.cuda.Stream()
|
||||
triton_graph = torch.cuda.CUDAGraph()
|
||||
@@ -328,7 +345,7 @@ def bench_run(
|
||||
w2_fp8scale,
|
||||
a_fp8_scale,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
min_run_time = 5
|
||||
num_warmup = 5
|
||||
|
||||
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
|
||||
if not should_use_fn(tensor):
|
||||
return None
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
stream = torch.cuda.Stream()
|
||||
with torch.cuda.stream(stream):
|
||||
graph_input = tensor.clone()
|
||||
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
|
||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||
allreduce_fn(graph_input)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
for _ in range(num_warmup):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for _ in range(num_trials):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
|
||||
|
||||
@@ -385,7 +385,7 @@ def benchmark_operation(
|
||||
# Warmup before graph capture
|
||||
for _ in range(warmup):
|
||||
operation_func(*args, **kwargs)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Create CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
@@ -398,19 +398,19 @@ def benchmark_operation(
|
||||
operation_func(*args, **kwargs)
|
||||
|
||||
# Graph warmup
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
for _ in range(warmup):
|
||||
graph.replay()
|
||||
|
||||
# Benchmark with CUDA graph
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for _ in range(trials // num_op_per_cudagraph):
|
||||
# operation_func(*args, **kwargs)
|
||||
graph.replay()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
end_time = time.perf_counter()
|
||||
|
||||
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
||||
|
||||
@@ -9,15 +9,15 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
||||
maybe_make_prepare_finalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_experts,
|
||||
fused_topk,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
|
||||
MoEPrepareAndFinalizeNoEP,
|
||||
)
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
@@ -131,16 +131,22 @@ def bench_run(
|
||||
w2_scale=w2_scale,
|
||||
per_act_token_quant=per_act_token,
|
||||
)
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
)
|
||||
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
fn = mk.FusedMoEKernel(
|
||||
maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
CutlassExpertsFp8(
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -163,16 +169,22 @@ def bench_run(
|
||||
w2_scale=w2_scale,
|
||||
per_act_token_quant=per_act_token,
|
||||
)
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
)
|
||||
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
fn = mk.FusedMoEKernel(
|
||||
maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
CutlassExpertsFp8(
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -212,7 +224,7 @@ def bench_run(
|
||||
def replay_graph(graph, num_repeats):
|
||||
for _ in range(num_repeats):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
cutlass_stream = torch.cuda.Stream()
|
||||
cutlass_graph = torch.cuda.CUDAGraph()
|
||||
@@ -227,7 +239,7 @@ def bench_run(
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
triton_stream = torch.cuda.Stream()
|
||||
triton_graph = torch.cuda.CUDAGraph()
|
||||
@@ -242,7 +254,7 @@ def bench_run(
|
||||
w2_scale,
|
||||
a_scale,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
min_run_time = 5
|
||||
num_warmup = 5
|
||||
|
||||
@@ -34,14 +34,14 @@ def main(
|
||||
residual = torch.randn_like(x) * scale if add_residual else None
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for _ in range(num_iters):
|
||||
layer(x, residual)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
|
||||
@@ -1035,7 +1035,7 @@ def bench_optype(
|
||||
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
||||
for kwargs in kwargs_list:
|
||||
op_type.bench_fn()(**kwargs)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||
|
||||
@@ -47,13 +47,13 @@ def benchmark_method(
|
||||
# Warmup
|
||||
for _ in range(num_warmup):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start = time.perf_counter()
|
||||
for _ in range(num_iters):
|
||||
_ = method(k_nope, k_pe)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
end = time.perf_counter()
|
||||
|
||||
return (end - start) / num_iters * 1000 # Convert to ms
|
||||
|
||||
@@ -17,6 +17,9 @@ from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||
from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
||||
maybe_make_prepare_finalize,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
@@ -51,7 +54,7 @@ def clear_triton_cache():
|
||||
|
||||
# Clear CUDA memory cache
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.accelerator.empty_cache()
|
||||
|
||||
# Try to clear Triton's runtime cache
|
||||
try:
|
||||
@@ -242,24 +245,33 @@ def benchmark_config(
|
||||
|
||||
deep_gemm_experts = None
|
||||
if use_deep_gemm:
|
||||
deep_gemm_experts = mk.FusedMoEModularKernel(
|
||||
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
|
||||
moe_config = (
|
||||
FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
num_logical_experts=num_experts,
|
||||
activation=MoEActivation.SILU,
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
)
|
||||
deep_gemm_experts = mk.FusedMoEKernel(
|
||||
prepare_finalize=maybe_make_prepare_finalize(
|
||||
moe=moe_config,
|
||||
quant_config=quant_config,
|
||||
allow_new_interface=True,
|
||||
use_monolithic=False,
|
||||
),
|
||||
fused_experts=TritonOrDeepGemmExperts(
|
||||
moe_config=FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
num_logical_experts=num_experts,
|
||||
activation=MoEActivation.SILU,
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
moe_config=moe_config,
|
||||
quant_config=quant_config,
|
||||
),
|
||||
inplace=not disable_inplace(),
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
@@ -269,8 +281,16 @@ def benchmark_config(
|
||||
|
||||
inplace = not disable_inplace()
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=inplace
|
||||
return deep_gemm_experts.apply(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation=MoEActivation.SILU,
|
||||
global_num_experts=num_experts,
|
||||
apply_router_weight_on_input=False,
|
||||
expert_map=False,
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
@@ -284,19 +304,19 @@ def benchmark_config(
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Capture 10 invocations with CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for _ in range(10):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Warmup
|
||||
for _ in range(5):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
@@ -304,7 +324,7 @@ def benchmark_config(
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
prepare(i)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
|
||||
@@ -131,7 +131,7 @@ def benchmark_config(
|
||||
topk_ids,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Benchmark
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
@@ -149,7 +149,7 @@ def benchmark_config(
|
||||
quant_config=quant_config,
|
||||
)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
||||
|
||||
|
||||
|
||||
@@ -69,19 +69,19 @@ def benchmark_permute(
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Capture 10 invocations with CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for _ in range(10):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Warmup
|
||||
for _ in range(5):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
@@ -89,7 +89,7 @@ def benchmark_permute(
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
prepare(i)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
@@ -159,26 +159,26 @@ def benchmark_unpermute(
|
||||
# JIT compilation & warmup
|
||||
input = prepare()
|
||||
run(input)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Capture 10 invocations with CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for _ in range(10):
|
||||
run(input)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Warmup
|
||||
for _ in range(5):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
end_event.record()
|
||||
|
||||
@@ -135,14 +135,14 @@ def benchmark_mrope(
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Time reference implementation
|
||||
torch_times = []
|
||||
for _ in range(benchmark_iter):
|
||||
query_clone = query.clone()
|
||||
key_clone = key.clone()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_time = time.time()
|
||||
|
||||
mrope_helper_class.forward_native(
|
||||
@@ -151,7 +151,7 @@ def benchmark_mrope(
|
||||
key_clone,
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
torch_times.append(time.time() - start_time)
|
||||
|
||||
# Time triton kernel implementation
|
||||
@@ -159,14 +159,14 @@ def benchmark_mrope(
|
||||
for _ in range(benchmark_iter):
|
||||
query_clone = query.clone()
|
||||
key_clone = key.clone()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_time = time.time()
|
||||
mrope_helper_class.forward_cuda(
|
||||
positions,
|
||||
query_clone,
|
||||
key_clone,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
triton_times.append(time.time() - start_time)
|
||||
|
||||
# Calculate statistics
|
||||
|
||||
@@ -103,7 +103,7 @@ def main(
|
||||
max_logits = torch.empty_like(exp_sums)
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
@@ -173,7 +173,7 @@ def main(
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid version: {version}")
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
|
||||
@@ -28,7 +28,7 @@ def _time_cuda(
|
||||
# warmup
|
||||
for _ in range(warmup_iters):
|
||||
fn()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
@@ -37,7 +37,7 @@ def _time_cuda(
|
||||
for _ in range(bench_iters):
|
||||
fn()
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ def main(
|
||||
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
@@ -39,7 +39,7 @@ def main(
|
||||
ops.scaled_int8_quant(x, scale)
|
||||
else:
|
||||
ops.scaled_fp8_quant(x, scale)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
|
||||
@@ -84,16 +84,16 @@ def run_benchmark(
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g):
|
||||
function_under_test()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
function_under_test = lambda: g.replay()
|
||||
|
||||
def run_cuda_benchmark(n_iters: int) -> float:
|
||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start = time.perf_counter()
|
||||
for _ in range(n_iters):
|
||||
function_under_test()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
end = time.perf_counter()
|
||||
return (end - start) / n_iters
|
||||
|
||||
@@ -104,7 +104,7 @@ def run_benchmark(
|
||||
|
||||
# free tensors to mitigate OOM when sweeping
|
||||
del key, value, key_cache, value_cache, slot_mapping
|
||||
torch.cuda.empty_cache()
|
||||
torch.accelerator.empty_cache()
|
||||
|
||||
return lat
|
||||
|
||||
|
||||
@@ -109,16 +109,16 @@ def run_benchmark(
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g):
|
||||
function_under_test()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
function_under_test = lambda: g.replay()
|
||||
|
||||
def run_cuda_benchmark(n_iters: int) -> float:
|
||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start = time.perf_counter()
|
||||
for _ in range(n_iters):
|
||||
function_under_test()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
end = time.perf_counter()
|
||||
return (end - start) / n_iters
|
||||
|
||||
@@ -129,7 +129,7 @@ def run_benchmark(
|
||||
|
||||
# free tensors to mitigate OOM when sweeping
|
||||
del key, value, key_cache, value_cache, slot_mapping
|
||||
torch.cuda.empty_cache()
|
||||
torch.accelerator.empty_cache()
|
||||
|
||||
return lat
|
||||
|
||||
|
||||
@@ -251,7 +251,7 @@ def benchmark(
|
||||
kernel(
|
||||
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
@@ -259,7 +259,7 @@ def benchmark(
|
||||
# Benchmark
|
||||
latencies: list[float] = []
|
||||
for _ in range(runs):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event.record()
|
||||
for i in range(iterations_per_run):
|
||||
|
||||
@@ -126,7 +126,7 @@ def benchmark_decode(
|
||||
)
|
||||
|
||||
def time_fn(fn, warmup=10, trials=20):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
times = []
|
||||
@@ -136,7 +136,7 @@ def benchmark_decode(
|
||||
start.record()
|
||||
fn()
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
times.append(start.elapsed_time(end)) # ms
|
||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ def benchmark_prefill(
|
||||
)
|
||||
|
||||
def time_fn(fn, warmup=10, trials=20):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start = torch.Event(enable_timing=True)
|
||||
end = torch.Event(enable_timing=True)
|
||||
times = []
|
||||
@@ -148,7 +148,7 @@ def benchmark_prefill(
|
||||
start.record()
|
||||
fn()
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
times.append(start.elapsed_time(end)) # ms
|
||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||
|
||||
|
||||
@@ -177,18 +177,18 @@ def benchmark_config(
|
||||
def run():
|
||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
# JIT complication & warmup
|
||||
for _ in range(5):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
start_event = torch.Event(enable_timing=True)
|
||||
end_event = torch.Event(enable_timing=True)
|
||||
|
||||
latencies: list[float] = []
|
||||
for i in range(num_iters):
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start_event.record()
|
||||
run()
|
||||
end_event.record()
|
||||
|
||||
@@ -35,7 +35,7 @@ def benchmark_shape(
|
||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
# Reference result in BF16
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
C_ref = A @ B.t()
|
||||
|
||||
# Pre-quantize B for all implementations
|
||||
@@ -121,14 +121,14 @@ def benchmark_shape(
|
||||
# Warmup
|
||||
for _ in range(warmup):
|
||||
func()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Timing loop
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
start = time.time()
|
||||
for _ in range(repeat):
|
||||
func()
|
||||
torch.cuda.synchronize()
|
||||
torch.accelerator.synchronize()
|
||||
end = time.time()
|
||||
|
||||
# Calculate timing and TFLOPS
|
||||
|
||||
@@ -7,7 +7,7 @@ First start serving your model
|
||||
```bash
|
||||
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
|
||||
|
||||
vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
|
||||
vllm serve $MODEL_PATH --served-model-name Llama
|
||||
```
|
||||
|
||||
The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).
|
||||
|
||||
@@ -13,28 +13,16 @@ endif()
|
||||
#
|
||||
# Define environment variables for special configurations
|
||||
#
|
||||
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
|
||||
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
|
||||
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
|
||||
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
|
||||
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
|
||||
set(ENABLE_X86_ISA $ENV{VLLM_CPU_X86})
|
||||
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
|
||||
|
||||
include_directories("${CMAKE_SOURCE_DIR}/csrc")
|
||||
|
||||
|
||||
set (ENABLE_NUMA TRUE)
|
||||
|
||||
#
|
||||
# Check the compile flags
|
||||
#
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mf16c"
|
||||
)
|
||||
endif()
|
||||
|
||||
if(MACOSX_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-DVLLM_CPU_EXTENSION")
|
||||
@@ -78,18 +66,6 @@ function(check_sysctl TARGET OUT)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
function (is_avx512_disabled OUT)
|
||||
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
||||
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
||||
set(${OUT} ON PARENT_SCOPE)
|
||||
else()
|
||||
set(${OUT} OFF PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
is_avx512_disabled(AVX512_DISABLED)
|
||||
|
||||
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
||||
message(STATUS "Apple Silicon Detected")
|
||||
set(APPLE_SILICON_FOUND TRUE)
|
||||
@@ -97,8 +73,6 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
||||
check_sysctl(hw.optional.neon ASIMD_FOUND)
|
||||
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
|
||||
else()
|
||||
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
|
||||
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||
@@ -108,77 +82,32 @@ else()
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_AVX2)
|
||||
set(AVX2_FOUND ON)
|
||||
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
|
||||
endif()
|
||||
if (ENABLE_AVX512)
|
||||
set(AVX512_FOUND ON)
|
||||
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
|
||||
endif()
|
||||
if (ENABLE_ARM_BF16)
|
||||
set(ARM_BF16_FOUND ON)
|
||||
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
|
||||
set(ENABLE_X86_ISA ON)
|
||||
if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3))
|
||||
message(FATAL_ERROR "X86 backend requires gcc/g++ >= 12.3")
|
||||
endif()
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mf16c")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX512 ${CXX_COMPILE_FLAGS})
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX2 ${CXX_COMPILE_FLAGS})
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX512
|
||||
"-mavx512f"
|
||||
"-mavx512vl"
|
||||
"-mavx512bw"
|
||||
"-mavx512dq")
|
||||
|
||||
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
|
||||
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
|
||||
set(ENABLE_AVX512BF16 ON)
|
||||
else()
|
||||
set(ENABLE_AVX512BF16 OFF)
|
||||
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
|
||||
endif()
|
||||
else()
|
||||
set(ENABLE_AVX512BF16 OFF)
|
||||
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
||||
endif()
|
||||
|
||||
find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND)
|
||||
if (AVX512VNNI_FOUND OR ENABLE_AVX512VNNI)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni")
|
||||
set(ENABLE_AVX512VNNI ON)
|
||||
else()
|
||||
set(ENABLE_AVX512VNNI OFF)
|
||||
message(WARNING "Disable AVX512-VNNI ISA support, requires gcc/g++ >= 12.3")
|
||||
endif()
|
||||
else()
|
||||
set(ENABLE_AVX512VNNI OFF)
|
||||
message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.")
|
||||
endif()
|
||||
|
||||
find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND)
|
||||
if (AMXBF16_FOUND OR ENABLE_AMXBF16)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile")
|
||||
set(ENABLE_AMXBF16 ON)
|
||||
add_compile_definitions(-DCPU_CAPABILITY_AMXBF16)
|
||||
else()
|
||||
set(ENABLE_AMXBF16 OFF)
|
||||
message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3")
|
||||
endif()
|
||||
else()
|
||||
set(ENABLE_AMXBF16 OFF)
|
||||
message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.")
|
||||
endif()
|
||||
|
||||
elseif (AVX2_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||
|
||||
"-mavx512dq"
|
||||
"-mavx512bf16"
|
||||
"-mavx512vnni"
|
||||
"-mamx-bf16"
|
||||
"-mamx-tile")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX2
|
||||
"-mavx2")
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
message(STATUS "PowerPC detected")
|
||||
if (POWER9_FOUND)
|
||||
@@ -219,12 +148,12 @@ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
||||
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
||||
endif()
|
||||
|
||||
|
||||
# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
|
||||
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
# Build oneDNN for GEMM kernels
|
||||
if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
|
||||
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
|
||||
set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
|
||||
@@ -313,13 +242,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
)
|
||||
else()
|
||||
message(STATUS "Downloading oneDNN from GitHub")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.10
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW FALSE
|
||||
)
|
||||
else()
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.10
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||
@@ -329,13 +269,21 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
|
||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
||||
set(ONEDNN_BUILD_GRAPH "OFF")
|
||||
set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
|
||||
set(ONEDNN_ENABLE_JIT_PROFILING "ON")
|
||||
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
||||
set(ONEDNN_VERBOSE "OFF")
|
||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "ON")
|
||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "ON")
|
||||
set(ONEDNN_VERBOSE "ON")
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||
|
||||
# TODO: Refactor this
|
||||
if (ENABLE_X86_ISA)
|
||||
# Note: only enable oneDNN for AVX512
|
||||
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
|
||||
else()
|
||||
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
|
||||
endif()
|
||||
|
||||
set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
|
||||
set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
|
||||
FetchContent_MakeAvailable(oneDNN)
|
||||
@@ -348,14 +296,20 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
PRIVATE ${oneDNN_SOURCE_DIR}/src
|
||||
)
|
||||
target_link_libraries(dnnl_ext dnnl torch)
|
||||
target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
|
||||
target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
|
||||
list(APPEND LIBS dnnl_ext)
|
||||
set(USE_ONEDNN ON)
|
||||
else()
|
||||
set(USE_ONEDNN OFF)
|
||||
endif()
|
||||
|
||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||
# TODO: Refactor this
|
||||
if (ENABLE_X86_ISA)
|
||||
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
|
||||
else()
|
||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||
endif()
|
||||
|
||||
if(ENABLE_NUMA)
|
||||
list(APPEND LIBS numa)
|
||||
@@ -390,25 +344,6 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cpu/cpu_attn.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp")
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
"csrc/cpu/cpu_fused_moe.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/sgl-kernels/gemm.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
add_compile_definitions(-DCPU_CAPABILITY_AVX512)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
@@ -421,21 +356,83 @@ if(USE_ONEDNN)
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
|
||||
if (ENABLE_X86_ISA)
|
||||
set(VLLM_EXT_SRC_AVX512
|
||||
"csrc/cpu/sgl-kernels/gemm.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
"csrc/cpu/cpu_fused_moe.cpp"
|
||||
"csrc/cpu/utils.cpp"
|
||||
"csrc/cpu/cpu_attn.cpp"
|
||||
"csrc/cpu/dnnl_kernels.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp"
|
||||
# TODO: Remove these files
|
||||
"csrc/cpu/activation.cpp"
|
||||
"csrc/cpu/layernorm.cpp"
|
||||
"csrc/cpu/mla_decode.cpp"
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
||||
|
||||
#
|
||||
# Define extension targets
|
||||
#
|
||||
set(VLLM_EXT_SRC_AVX2
|
||||
"csrc/cpu/utils.cpp"
|
||||
"csrc/cpu/cpu_attn.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp"
|
||||
# TODO: Remove these files
|
||||
"csrc/cpu/activation.cpp"
|
||||
"csrc/cpu/layernorm.cpp"
|
||||
"csrc/cpu/mla_decode.cpp"
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
||||
|
||||
define_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC}
|
||||
LIBRARIES ${LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
|
||||
|
||||
define_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX512}
|
||||
LIBRARIES ${LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
# For SGL kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
|
||||
# For AMX kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||
|
||||
define_extension_target(
|
||||
_C_AVX2
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX2}
|
||||
LIBRARIES ${LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
else()
|
||||
message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
|
||||
#
|
||||
# Define extension targets
|
||||
#
|
||||
define_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC}
|
||||
LIBRARIES ${LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
endif()
|
||||
|
||||
message(STATUS "Enabling C extension.")
|
||||
|
||||
@@ -17,7 +17,8 @@ endif()
|
||||
# They should be identical but if they aren't, this is a massive footgun.
|
||||
#
|
||||
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2), --component _vllm_fa3_C (for FA3),
|
||||
# or --component _vllm_fa4_cutedsl_C (for FA4 CuteDSL Python files).
|
||||
# If no component is specified, vllm-flash-attn is still installed.
|
||||
|
||||
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||
@@ -38,22 +39,16 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6
|
||||
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
|
||||
|
||||
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||
# This is here to support installing all components under the same prefix with cmake --install.
|
||||
# setup.py installs every component separately but uses the same prefix for all.
|
||||
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
|
||||
# and these statements don't hurt when installing neither component.
|
||||
# ALL_COMPONENTS ensures the save/modify/restore runs exactly once regardless
|
||||
# of how many components are being installed, avoiding double-append of /vllm/.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
|
||||
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
|
||||
@@ -62,22 +57,48 @@ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Restore the install prefix
|
||||
# Restore the install prefix after FA's install rules
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
# Install shared Python files for both FA2 and FA3 components
|
||||
foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
|
||||
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")"
|
||||
COMPONENT ${_FA_COMPONENT})
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
# Copy vllm_flash_attn python files (except __init__.py and flash_attn_interface.py
|
||||
# which are source-controlled in vllm)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT ${_FA_COMPONENT}
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
PATTERN "__init__.py" EXCLUDE
|
||||
PATTERN "flash_attn_interface.py" EXCLUDE
|
||||
)
|
||||
|
||||
endforeach()
|
||||
|
||||
#
|
||||
# FA4 CuteDSL component
|
||||
# This is a Python-only component that copies the flash_attn/cute directory
|
||||
# and transforms imports to match our package structure.
|
||||
#
|
||||
add_custom_target(_vllm_fa4_cutedsl_C)
|
||||
|
||||
# Copy flash_attn/cute directory (needed for FA4) and transform imports
|
||||
# The cute directory uses flash_attn.cute imports internally, which we replace
|
||||
# with vllm.vllm_flash_attn.cute to match our package structure.
|
||||
install(CODE "
|
||||
file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
|
||||
foreach(SRC_FILE \${CUTE_PY_FILES})
|
||||
file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
|
||||
set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
|
||||
get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
|
||||
file(MAKE_DIRECTORY \${DST_DIR})
|
||||
file(READ \${SRC_FILE} FILE_CONTENTS)
|
||||
string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
|
||||
file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
|
||||
endforeach()
|
||||
" COMPONENT _vllm_fa4_cutedsl_C)
|
||||
|
||||
@@ -5,117 +5,11 @@
|
||||
#include <cmath>
|
||||
|
||||
#include "cuda_compat.h"
|
||||
#include "cuda_vec_utils.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
struct alignas(32) u32x8_t {
|
||||
uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
|
||||
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
|
||||
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
|
||||
: "l"(ptr));
|
||||
#else
|
||||
const uint4* uint_ptr = reinterpret_cast<const uint4*>(ptr);
|
||||
uint4 top_half = __ldg(&uint_ptr[0]);
|
||||
uint4 bottom_half = __ldg(&uint_ptr[1]);
|
||||
val.u0 = top_half.x;
|
||||
val.u1 = top_half.y;
|
||||
val.u2 = top_half.z;
|
||||
val.u3 = top_half.w;
|
||||
val.u4 = bottom_half.x;
|
||||
val.u5 = bottom_half.y;
|
||||
val.u6 = bottom_half.z;
|
||||
val.u7 = bottom_half.w;
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
|
||||
:
|
||||
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
|
||||
"r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7)
|
||||
: "memory");
|
||||
#else
|
||||
uint4* uint_ptr = reinterpret_cast<uint4*>(ptr);
|
||||
uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3);
|
||||
uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <bool support_256>
|
||||
struct VecTraits;
|
||||
|
||||
template <>
|
||||
struct VecTraits<true> {
|
||||
static constexpr int ARCH_MAX_VEC_SIZE = 32;
|
||||
using vec_t = u32x8_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct VecTraits<false> {
|
||||
static constexpr int ARCH_MAX_VEC_SIZE = 16;
|
||||
using vec_t = int4;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct PackedTraits;
|
||||
|
||||
template <>
|
||||
struct PackedTraits<c10::BFloat16> {
|
||||
using packed_t = __nv_bfloat162;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedTraits<c10::Half> {
|
||||
using packed_t = __half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedTraits<float> {
|
||||
using packed_t = float2;
|
||||
};
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __bfloat1622float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __half22float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __float22bfloat162_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __float22half2_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
|
||||
const packed_t& y) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
|
||||
std::is_same_v<packed_t, __half2>) {
|
||||
return __hmul2(x, y);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return make_float2(x.x * y.x, x.y * y.y);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
|
||||
bool act_first>
|
||||
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
|
||||
@@ -131,16 +25,6 @@ __device__ __forceinline__ packed_t packed_compute(const packed_t& x,
|
||||
: packed_mul(x, PACKED_ACT_FN(y));
|
||||
}
|
||||
|
||||
// Check if all pointers are 16-byte aligned for int4 vectorized access
|
||||
__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
|
||||
}
|
||||
|
||||
// Check if all pointers are 16-byte aligned for longlong4_32a vectorized access
|
||||
__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
|
||||
}
|
||||
|
||||
// Activation and gating kernel template.
|
||||
template <typename scalar_t, typename packed_t,
|
||||
scalar_t (*ACT_FN)(const scalar_t&),
|
||||
@@ -155,36 +39,32 @@ __global__ void act_and_mul_kernel(
|
||||
scalar_t* out_ptr = out + blockIdx.x * d;
|
||||
|
||||
if constexpr (use_vec) {
|
||||
// Fast path: 128-bit/256-bit vectorized loop
|
||||
using vec_t = typename VecTraits<use_256b>::vec_t;
|
||||
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
|
||||
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
|
||||
using cuda_t = typename CUDATypeConverter<scalar_t>::Type;
|
||||
using pvec_t = PackedVec<cuda_t, use_256b>;
|
||||
|
||||
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
|
||||
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
|
||||
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
|
||||
const int num_vecs = d / 2 / VEC_SIZE;
|
||||
const pvec_t* x_vec = reinterpret_cast<const pvec_t*>(x_ptr);
|
||||
const pvec_t* y_vec = reinterpret_cast<const pvec_t*>(y_ptr);
|
||||
pvec_t* out_vec = reinterpret_cast<pvec_t*>(out_ptr);
|
||||
const int num_vecs = d / 2 / pvec_t::NUM_ELTS;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
vec_t x, y;
|
||||
pvec_t x, y;
|
||||
if constexpr (use_256b) {
|
||||
ld256(x, &x_vec[i]);
|
||||
ld256(y, &y_vec[i]);
|
||||
} else {
|
||||
x = VLLM_LDG(&x_vec[i]);
|
||||
y = VLLM_LDG(&y_vec[i]);
|
||||
ld128(x, &x_vec[i]);
|
||||
ld128(y, &y_vec[i]);
|
||||
}
|
||||
auto* xp = reinterpret_cast<packed_t*>(&x);
|
||||
auto* yp = reinterpret_cast<packed_t*>(&y);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
xp[j] =
|
||||
packed_compute<packed_t, PACKED_ACT_FN, act_first>(xp[j], yp[j]);
|
||||
for (int j = 0; j < pvec_t::NUM_ELTS; j++) {
|
||||
x.elts[j] = packed_compute<packed_t, PACKED_ACT_FN, act_first>(
|
||||
x.elts[j], y.elts[j]);
|
||||
}
|
||||
if constexpr (use_256b) {
|
||||
st256(x, &out_vec[i]);
|
||||
} else {
|
||||
out_vec[i] = x;
|
||||
st128(x, &out_vec[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -272,51 +152,54 @@ packed_gelu_tanh_kernel(const packed_t& val) {
|
||||
// Launch activation and gating kernel.
|
||||
// Use ACT_FIRST (bool) indicating whether to apply the activation function
|
||||
// first.
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
|
||||
auto dtype = input.scalar_type(); \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
if (num_tokens == 0) { \
|
||||
return; \
|
||||
} \
|
||||
dim3 grid(num_tokens); \
|
||||
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
|
||||
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
|
||||
int vec_size = support_vec / at::elementSize(dtype); \
|
||||
const bool use_vec = (d % vec_size == 0); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
if (use_vec) { \
|
||||
dim3 block(std::min(d / vec_size, 1024)); \
|
||||
if (cc_major >= 10 && num_tokens > 128) { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, true, true><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} else { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, true, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} \
|
||||
} else { \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
ACT_FIRST, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
|
||||
auto dtype = input.scalar_type(); \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
if (num_tokens == 0) { \
|
||||
return; \
|
||||
} \
|
||||
dim3 grid(num_tokens); \
|
||||
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
|
||||
int support_vec = \
|
||||
(CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \
|
||||
? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE \
|
||||
: vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE; \
|
||||
int vec_size = support_vec / at::elementSize(dtype); \
|
||||
const bool use_vec = (d % vec_size == 0); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
if (use_vec) { \
|
||||
dim3 block(std::min(d / vec_size, 1024)); \
|
||||
if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
ACT_FIRST, true, true><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} else { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
ACT_FIRST, true, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
} \
|
||||
} else { \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel< \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
ACT_FIRST, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
|
||||
}); \
|
||||
}
|
||||
|
||||
void silu_and_mul(torch::Tensor& out, // [..., d]
|
||||
@@ -378,35 +261,31 @@ __global__ void act_and_mul_kernel_with_param(
|
||||
scalar_t* out_ptr = out + blockIdx.x * d;
|
||||
|
||||
if constexpr (use_vec) {
|
||||
// Fast path: 128-bit/256-bit vectorized loop
|
||||
using vec_t = typename VecTraits<use_256b>::vec_t;
|
||||
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
|
||||
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
|
||||
using cuda_t = typename CUDATypeConverter<scalar_t>::Type;
|
||||
using pvec_t = PackedVec<cuda_t, use_256b>;
|
||||
|
||||
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
|
||||
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
|
||||
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
|
||||
const int num_vecs = d / 2 / VEC_SIZE;
|
||||
const pvec_t* x_vec = reinterpret_cast<const pvec_t*>(x_ptr);
|
||||
const pvec_t* y_vec = reinterpret_cast<const pvec_t*>(y_ptr);
|
||||
pvec_t* out_vec = reinterpret_cast<pvec_t*>(out_ptr);
|
||||
const int num_vecs = d / 2 / pvec_t::NUM_ELTS;
|
||||
|
||||
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
|
||||
vec_t x, y;
|
||||
pvec_t x, y;
|
||||
if constexpr (use_256b) {
|
||||
ld256(x, &x_vec[i]);
|
||||
ld256(y, &y_vec[i]);
|
||||
} else {
|
||||
x = VLLM_LDG(&x_vec[i]);
|
||||
y = VLLM_LDG(&y_vec[i]);
|
||||
ld128(x, &x_vec[i]);
|
||||
ld128(y, &y_vec[i]);
|
||||
}
|
||||
auto* xp = reinterpret_cast<packed_t*>(&x);
|
||||
auto* yp = reinterpret_cast<packed_t*>(&y);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < VEC_SIZE; j++) {
|
||||
xp[j] = packed_mul(PACKED_ACT_FN(xp[j], param), yp[j]);
|
||||
for (int j = 0; j < pvec_t::NUM_ELTS; j++) {
|
||||
x.elts[j] = packed_mul(PACKED_ACT_FN(x.elts[j], param), y.elts[j]);
|
||||
}
|
||||
if constexpr (use_256b) {
|
||||
st256(x, &out_vec[i]);
|
||||
} else {
|
||||
out_vec[i] = x;
|
||||
st128(x, &out_vec[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -499,21 +378,24 @@ __global__ void swigluoai_and_mul_kernel(
|
||||
} \
|
||||
dim3 grid(num_tokens); \
|
||||
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
|
||||
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
|
||||
int support_vec = \
|
||||
(CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \
|
||||
? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE \
|
||||
: vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE; \
|
||||
int vec_size = support_vec / at::elementSize(dtype); \
|
||||
const bool use_vec = (d % vec_size == 0); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
if (use_vec) { \
|
||||
dim3 block(std::min(d / vec_size, 1024)); \
|
||||
if (cc_major >= 10 && num_tokens > 128) { \
|
||||
if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
dtype, "act_and_mul_kernel_with_param", [&] { \
|
||||
vllm::act_and_mul_kernel_with_param< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL< \
|
||||
typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
true, true><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
|
||||
PARAM); \
|
||||
@@ -522,10 +404,10 @@ __global__ void swigluoai_and_mul_kernel(
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
dtype, "act_and_mul_kernel_with_param", [&] { \
|
||||
vllm::act_and_mul_kernel_with_param< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL< \
|
||||
typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
true, false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
|
||||
PARAM); \
|
||||
@@ -535,9 +417,9 @@ __global__ void swigluoai_and_mul_kernel(
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \
|
||||
vllm::act_and_mul_kernel_with_param< \
|
||||
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
|
||||
scalar_t, typename vllm::PackedTypeConverter<scalar_t>::Type, \
|
||||
KERNEL<scalar_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
|
||||
PACKED_KERNEL<typename vllm::PackedTypeConverter<scalar_t>::Type>, \
|
||||
false><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, PARAM); \
|
||||
}); \
|
||||
@@ -629,14 +511,17 @@ __global__ void activation_kernel(
|
||||
} \
|
||||
dim3 grid(num_tokens); \
|
||||
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
|
||||
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
|
||||
int support_vec = \
|
||||
(CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \
|
||||
? vllm::VecTraits<true>::ARCH_MAX_VEC_SIZE \
|
||||
: vllm::VecTraits<false>::ARCH_MAX_VEC_SIZE; \
|
||||
int vec_size = support_vec / at::elementSize(dtype); \
|
||||
const bool use_vec = (d % vec_size == 0); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
if (use_vec) { \
|
||||
dim3 block(std::min(d / vec_size, 1024)); \
|
||||
if (cc_major >= 10 && num_tokens > 128) { \
|
||||
if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
|
||||
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, true> \
|
||||
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||
|
||||
@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
|
||||
int64_t quant_block_size, // quantization block size
|
||||
const std::string& scale_fmt);
|
||||
|
||||
// Concatenate query nope and rope for MLA/DSA attention
|
||||
void concat_mla_q(
|
||||
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
|
||||
|
||||
// Extract function to gather quantized K cache
|
||||
void cp_gather_indexer_k_quant_cache(
|
||||
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
#include "concat_mla_q.cuh"
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
||||
@@ -995,75 +996,67 @@ namespace vllm {
|
||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
||||
const int32_t block_size, const int32_t head_dim,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = workspace_starts[bid];
|
||||
const int32_t seq_len = seq_lens[bid];
|
||||
const int32_t tot_slots = seq_len;
|
||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
||||
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
|
||||
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ workspace_starts, // [num_reqs]
|
||||
const int32_t num_reqs, const int32_t block_size,
|
||||
const int32_t total_tokens, const int64_t block_table_stride,
|
||||
const int64_t cache_block_stride, const int64_t cache_entry_stride,
|
||||
const int64_t dst_entry_stride) {
|
||||
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||
if (flat_warp_id >= total_tokens) return;
|
||||
const int lane_id = threadIdx.x & 31;
|
||||
|
||||
const int32_t split_start = split * split_slots;
|
||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
||||
|
||||
const bool is_active_split = (split_start < tot_slots);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = split_start;
|
||||
int32_t offset_div = offset / block_size;
|
||||
offset = offset % block_size;
|
||||
const int32_t* batch_block_table = block_table + batch_offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
// Process each token in this split
|
||||
for (int pid = split_start; pid < split_end; ++pid) {
|
||||
auto block_id = batch_block_table[offset_div];
|
||||
const uint8_t* token_ptr =
|
||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
||||
|
||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
||||
const uint8_t* no_pe_ptr = token_ptr;
|
||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||
const __nv_bfloat16* rope_ptr =
|
||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
||||
|
||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
||||
if (tid < 512) {
|
||||
// FP8 dequantization
|
||||
const int tile = tid >> 7; // each tile is 128 elements
|
||||
const float scale = scales_ptr[tile];
|
||||
const uint8_t val = no_pe_ptr[tid];
|
||||
dst_ptr[tid] =
|
||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
||||
} else if (tid < 576) {
|
||||
// Rope copy (64 bf16 elements)
|
||||
const int rope_idx = tid - 512;
|
||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
||||
}
|
||||
|
||||
// Move to next token
|
||||
offset += 1;
|
||||
if (offset == block_size) {
|
||||
offset_div += 1;
|
||||
offset = 0;
|
||||
}
|
||||
// Binary search to find which request owns this output token
|
||||
int lo = 0, hi = num_reqs - 1;
|
||||
while (lo < hi) {
|
||||
int mid = (lo + hi + 1) >> 1;
|
||||
if (workspace_starts[mid] <= flat_warp_id)
|
||||
lo = mid;
|
||||
else
|
||||
hi = mid - 1;
|
||||
}
|
||||
const int req_id = lo;
|
||||
|
||||
// Compute physical token address via block table
|
||||
const int out_token_id = flat_warp_id;
|
||||
const int token_offset = out_token_id - workspace_starts[req_id];
|
||||
const int cache_block_idx = token_offset / block_size;
|
||||
const int offset_in_block = token_offset % block_size;
|
||||
const int physical_block =
|
||||
block_table[req_id * block_table_stride + cache_block_idx];
|
||||
|
||||
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
|
||||
offset_in_block * cache_entry_stride;
|
||||
|
||||
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
|
||||
const int4 fp8_data = nope_src[lane_id];
|
||||
|
||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||
const float scale = scales_ptr[lane_id >> 3];
|
||||
|
||||
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
|
||||
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
|
||||
#ifdef USE_ROCM
|
||||
const bf16_8_t bf16_lo =
|
||||
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
|
||||
const bf16_8_t bf16_hi =
|
||||
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
|
||||
#else
|
||||
const bf16_8_t bf16_lo =
|
||||
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
|
||||
const bf16_8_t bf16_hi =
|
||||
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
|
||||
#endif
|
||||
|
||||
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
|
||||
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
|
||||
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
|
||||
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
|
||||
|
||||
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
|
||||
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
|
||||
rope_dst[lane_id] = rope_src[lane_id];
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
@@ -1257,15 +1250,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
||||
}
|
||||
|
||||
// Decide on the number of splits based on the batch size
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(576);
|
||||
const int total_tokens = dst.size(0);
|
||||
constexpr int warps_per_block = 8;
|
||||
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
|
||||
const int block_size_threads = warps_per_block * 32; // 256 threads
|
||||
|
||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
|
||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
|
||||
stream>>>(
|
||||
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
||||
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
|
||||
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
|
||||
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
|
||||
static_cast<int32_t>(batch_size), block_size, total_tokens,
|
||||
block_table_stride, cache_block_stride, cache_entry_stride,
|
||||
dst_entry_stride);
|
||||
}
|
||||
@@ -1365,3 +1359,43 @@ void cp_gather_indexer_k_quant_cache(
|
||||
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
||||
}
|
||||
}
|
||||
|
||||
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
|
||||
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
|
||||
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
|
||||
// rope_dim]
|
||||
) {
|
||||
const int num_tokens = ql_nope.size(0);
|
||||
const int num_heads = ql_nope.size(1);
|
||||
const int nope_dim = ql_nope.size(2);
|
||||
const int rope_dim = q_pe.size(2);
|
||||
|
||||
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
|
||||
nope_dim);
|
||||
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
|
||||
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
|
||||
|
||||
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
|
||||
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
|
||||
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
|
||||
|
||||
if (num_tokens == 0) return;
|
||||
|
||||
constexpr int warps_per_block = 8;
|
||||
const int total_warps = num_tokens * num_heads;
|
||||
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
|
||||
const int block_size = warps_per_block * 32;
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
|
||||
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
|
||||
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
|
||||
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
|
||||
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
|
||||
q_pe.stride(1));
|
||||
});
|
||||
}
|
||||
|
||||
60
csrc/concat_mla_q.cuh
Normal file
60
csrc/concat_mla_q.cuh
Normal file
@@ -0,0 +1,60 @@
|
||||
#ifndef CONCAT_MLA_Q_CUH_
|
||||
#define CONCAT_MLA_Q_CUH_
|
||||
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "cuda_vec_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
|
||||
// q_pe [num_tokens, num_heads, 64]
|
||||
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
|
||||
// Currently instantiated only for NOPE_DIM=512.
|
||||
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
|
||||
template <typename DType, int NOPE_DIM>
|
||||
__global__ void ConcatMLAQKernel(
|
||||
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
|
||||
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
|
||||
const int64_t out_stride_0, const int64_t out_stride_1,
|
||||
const int64_t nope_stride_0, const int64_t nope_stride_1,
|
||||
const int64_t pe_stride_0, const int64_t pe_stride_1) {
|
||||
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||
if (flat_warp_id >= num_tokens * num_heads) return;
|
||||
|
||||
const int token_id = flat_warp_id / num_heads;
|
||||
const int head_id = flat_warp_id % num_heads;
|
||||
const int lane_id = threadIdx.x & 31;
|
||||
|
||||
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
|
||||
constexpr int nope_vec_loads =
|
||||
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
|
||||
|
||||
const DType* nope_src =
|
||||
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
|
||||
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < nope_vec_loads; i++) {
|
||||
const int offset = i * 32 + lane_id;
|
||||
if constexpr (use_256b) {
|
||||
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
|
||||
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
|
||||
} else {
|
||||
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
|
||||
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
|
||||
}
|
||||
}
|
||||
|
||||
const int* rope_src = reinterpret_cast<const int*>(
|
||||
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
|
||||
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
|
||||
head_id * out_stride_1 + NOPE_DIM);
|
||||
|
||||
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
#endif // CONCAT_MLA_Q_CUH_
|
||||
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
||||
const int64_t block_size, const int64_t block_size_stride) {
|
||||
// For AMX 2D tiles, size of each line is 64 bytes
|
||||
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
||||
// For AMX B martix, N always is 16
|
||||
// For AMX B matrix, N always is 16
|
||||
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
||||
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
||||
// For now suppose block_size is divisible by amx_tile_column_num
|
||||
|
||||
@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
||||
};
|
||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||
{b_k_stride_, b_n_stride_});
|
||||
#ifdef __aarch64__
|
||||
|
||||
// dummy M size for prepacking weights
|
||||
// Prepacking weights improves performance and avoid runtime reorders
|
||||
constexpr dnnl_dim_t kProbeM = 128;
|
||||
#else
|
||||
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
|
||||
#endif
|
||||
|
||||
prepack_weight(args.b_ptr, original_b_md,
|
||||
create_primitive_desc(
|
||||
@@ -411,21 +408,19 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||
{b_k_stride_, b_n_stride_});
|
||||
|
||||
// dummy M size for prepacking weights
|
||||
// Prepacking weights improves performance and avoid runtime reorders
|
||||
constexpr dnnl_dim_t kProbeM = 128;
|
||||
|
||||
prepack_weight(args.b_ptr, original_b_md,
|
||||
create_primitive_desc(
|
||||
MSizeCacheKey{
|
||||
#ifdef VLLM_USE_ACL
|
||||
// Arm Compute Library (ACL) backend for oneDNN does
|
||||
// not support runtime
|
||||
// dimensions, so we set M to a default value
|
||||
.a_m_size = 128,
|
||||
.a_m_stride = b_k_size_,
|
||||
#else
|
||||
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
||||
#endif
|
||||
.use_bias = false,
|
||||
.bias_type = dnnl::memory::data_type::undef},
|
||||
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
|
||||
// selector can choose an optimally blocked
|
||||
// weight layout.
|
||||
.a_m_size = kProbeM,
|
||||
.a_m_stride = b_k_size_,
|
||||
.use_bias = false,
|
||||
.bias_type = dnnl::memory::data_type::undef},
|
||||
true)
|
||||
.weights_desc());
|
||||
init_runtime_memory_cache(args);
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
|
||||
#include <torch/library.h>
|
||||
|
||||
// Note: overwrite the external definition for sharing same name between
|
||||
// libraries use different ISAs.
|
||||
#define TORCH_EXTENSION_NAME _C
|
||||
|
||||
std::string init_cpu_threads_env(const std::string& cpu_ids);
|
||||
|
||||
void release_dnnl_matmul_handler(int64_t handler);
|
||||
@@ -324,19 +328,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"str act, str isa) -> ()");
|
||||
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
// CPU utils
|
||||
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
|
||||
cpu_ops.def(
|
||||
ops.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
|
||||
ops.def(
|
||||
"mla_decode_kvcache("
|
||||
" Tensor! out, Tensor query, Tensor kv_cache,"
|
||||
" float scale, Tensor block_tables, Tensor seq_lens) -> ()");
|
||||
cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
|
||||
ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||
|
||||
361
csrc/cuda_vec_utils.cuh
Normal file
361
csrc/cuda_vec_utils.cuh
Normal file
@@ -0,0 +1,361 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <c10/util/BFloat16.h>
|
||||
#include <c10/util/Half.h>
|
||||
#include <cassert>
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#else
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
// Device-side: SM100+ architecture with CUDA 12.9+ toolkit, which
|
||||
// together enable 256-bit (v8.u32) PTX load/store instructions.
|
||||
// Use for PTX instruction selection with architecture fallback paths.
|
||||
#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||
#define VLLM_256B_PTX_ENABLED 1
|
||||
#else
|
||||
#define VLLM_256B_PTX_ENABLED 0
|
||||
#endif
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// ============================================================
|
||||
// Types and traits
|
||||
// ============================================================
|
||||
|
||||
// 256-bit (32-byte) aligned vector type: 8 x uint32_t
struct alignas(32) u32x8_t {
  uint32_t d[8];
};

// VecTraits — select between 128-bit (int4) and 256-bit
// (u32x8_t) vector types at compile time.
// The primary template is declared but not defined: only the two
// specializations below are usable.
template <bool support_256>
struct VecTraits;

template <>
struct VecTraits<true> {
  // Widest vector memory op available: 32 bytes (v8.u32 on SM100+).
  static constexpr int ARCH_MAX_VEC_SIZE = 32;
  using vec_t = u32x8_t;
};

template <>
struct VecTraits<false> {
  // Fallback: 16-byte (128-bit) vectors, supported everywhere.
  static constexpr int ARCH_MAX_VEC_SIZE = 16;
  using vec_t = int4;
};
|
||||
|
||||
// PackedTypeConverter — map between CUDA scalar and packed types
// half <-> half2, __nv_bfloat16 <-> __nv_bfloat162, etc.
// The primary template is deliberately unusable: instantiating it for an
// unmapped type trips the static_assert with a readable message.
template <typename T>
struct PackedTypeConverter {
  static_assert(sizeof(T) == 0,
                "PackedTypeConverter is not specialized for this type.");
};

// Packed -> scalar direction.
template <>
struct PackedTypeConverter<half2> {
  using Type = half;
};

// Scalar -> packed direction.
template <>
struct PackedTypeConverter<half> {
  using Type = half2;
};

template <>
struct PackedTypeConverter<__nv_bfloat162> {
  using Type = __nv_bfloat16;
};

template <>
struct PackedTypeConverter<__nv_bfloat16> {
  using Type = __nv_bfloat162;
};

template <>
struct PackedTypeConverter<float> {
  using Type = float2;
};

template <>
struct PackedTypeConverter<float2> {
  using Type = float;
};

// PyTorch scalar wrappers map straight to the CUDA packed type.
template <>
struct PackedTypeConverter<c10::Half> {
  using Type = half2;
};

template <>
struct PackedTypeConverter<c10::BFloat16> {
  using Type = __nv_bfloat162;
};

// CUDATypeConverter — map PyTorch scalar types to CUDA scalar
// c10::Half -> half, c10::BFloat16 -> __nv_bfloat16
// Identity for types that are already native CUDA scalars.
template <typename T>
struct CUDATypeConverter {
  using Type = T;
};

template <>
struct CUDATypeConverter<c10::Half> {
  using Type = half;
};

template <>
struct CUDATypeConverter<c10::BFloat16> {
  using Type = __nv_bfloat16;
};

// PackedVec — typed vector container for packed element access.
// Derives alignment and element count from VecTraits.
// Type is the CUDA scalar type (e.g. half, __nv_bfloat16).
template <class Type, bool use_256b>
struct alignas(VecTraits<use_256b>::ARCH_MAX_VEC_SIZE) PackedVec {
  // Number of packed elements (e.g. half2) filling one max-width vector.
  static constexpr int NUM_ELTS =
      VecTraits<use_256b>::ARCH_MAX_VEC_SIZE /
      sizeof(typename PackedTypeConverter<Type>::Type);
  typename PackedTypeConverter<Type>::Type elts[NUM_ELTS];
};
|
||||
|
||||
// ============================================================
|
||||
// Load / store primitives
|
||||
// ============================================================
|
||||
|
||||
// 256-bit load / store — SM100+ only (PTX v8 instructions).
// ld256 goes through the non-coherent (read-only, .nc) cache path.
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
#if VLLM_256B_PTX_ENABLED
  asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
               : "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
                 "=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
               : "l"(ptr));
#else
  // Unsupported target: trap in debug builds rather than silently no-op.
  assert(false && "ld256 requires SM100+ with CUDA 12.9+");
#endif
}

__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
#if VLLM_256B_PTX_ENABLED
  // "memory" clobber: the store must not be reordered past other accesses.
  asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
               :
               : "l"(ptr), "r"(val.d[0]), "r"(val.d[1]), "r"(val.d[2]),
                 "r"(val.d[3]), "r"(val.d[4]), "r"(val.d[5]), "r"(val.d[6]),
                 "r"(val.d[7])
               : "memory");
#else
  assert(false && "st256 requires SM100+ with CUDA 12.9+");
#endif
}

// Generic ld256 / st256 for any 32-byte aligned type (e.g. PackedVec).
// Non-template overloads above are preferred for u32x8_t.
template <typename T>
__device__ __forceinline__ void ld256(T& val, const T* ptr) {
  static_assert(sizeof(T) == 32, "ld256 requires a 32-byte type");
  ld256(reinterpret_cast<u32x8_t&>(val), reinterpret_cast<const u32x8_t*>(ptr));
}

template <typename T>
__device__ __forceinline__ void st256(T& val, T* ptr) {
  static_assert(sizeof(T) == 32, "st256 requires a 32-byte type");
  st256(reinterpret_cast<u32x8_t&>(val), reinterpret_cast<u32x8_t*>(ptr));
}

// 128-bit load / store via __ldg (read-only cache hint).
template <typename T>
__device__ __forceinline__ void ld128(T& val, const T* ptr) {
  static_assert(sizeof(T) == 16, "ld128 requires a 16-byte type");
  *reinterpret_cast<int4*>(&val) = __ldg(reinterpret_cast<const int4*>(ptr));
}

template <typename T>
__device__ __forceinline__ void st128(T& val, T* ptr) {
  static_assert(sizeof(T) == 16, "st128 requires a 16-byte type");
  *reinterpret_cast<int4*>(ptr) = *reinterpret_cast<int4*>(&val);
}
|
||||
|
||||
// 256-bit cache-streaming (.cs) load / store — SM100+ only.
|
||||
__forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
|
||||
#if VLLM_256B_PTX_ENABLED
|
||||
u32x8_t val;
|
||||
asm volatile("ld.global.cs.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];"
|
||||
: "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
|
||||
"=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
|
||||
: "l"(addr));
|
||||
return val;
|
||||
#else
|
||||
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
||||
#endif
|
||||
}
|
||||
|
||||
// 256-bit cache-streaming (.cs) store — SM100+ only; counterpart of
// ld256_cs. Traps in debug builds on unsupported targets.
__forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
#if VLLM_256B_PTX_ENABLED
  asm volatile(
      "st.global.cs.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};" ::"l"(addr),
      "r"(val.d[0]), "r"(val.d[1]), "r"(val.d[2]), "r"(val.d[3]), "r"(val.d[4]),
      "r"(val.d[5]), "r"(val.d[6]), "r"(val.d[7]));
#else
  assert(false && "st256_cs requires SM100+ with CUDA 12.9+");
#endif
}
|
||||
|
||||
// 32-bit load / store.
// ld32 routes through the read-only data cache (__ldg); st32 is a plain
// global store.
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }

__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }

// 32-bit cache-streaming (.cs) load / store.
// Falls back to ld32/st32 on ROCm (no .cs hint).
__forceinline__ __device__ int ld32_cs(const int* addr) {
  int val;
#ifndef USE_ROCM
  asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
#else
  val = ld32(addr);
#endif
  return val;
}

__forceinline__ __device__ void st32_cs(int* addr, int val) {
#ifndef USE_ROCM
  asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
#else
  st32(addr, val);
#endif
}
|
||||
|
||||
// 128-bit cache-streaming (.cs) load / store.
// Falls back to ld128/st128 on ROCm (no .cs hint).
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
  int4 val;
#ifndef USE_ROCM
  asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
               : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
               : "l"(addr));
#else
  ld128(val, addr);
#endif
  return val;
}

__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
#ifndef USE_ROCM
  asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
               "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
#else
  st128(val, addr);
#endif
}
|
||||
|
||||
// Predicated 256-bit / 128-bit cache-global (.cg) loads.
// Returns zero if pred is false. SM100+ only.
// The predicate guards the load inside the PTX block itself, so no memory
// access is issued when pred is false; the destination registers are zeroed
// first so the result is well-defined either way.
__device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
                                                 bool pred) {
#if VLLM_256B_PTX_ENABLED
  asm volatile(
      "{\n"
      " .reg .pred pr;\n"
      " setp.ne.u32 pr, %8, 0;\n"
      " mov.u32 %0, 0;\n"
      " mov.u32 %1, 0;\n"
      " mov.u32 %2, 0;\n"
      " mov.u32 %3, 0;\n"
      " mov.u32 %4, 0;\n"
      " mov.u32 %5, 0;\n"
      " mov.u32 %6, 0;\n"
      " mov.u32 %7, 0;\n"
      " @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
      "}\n"
      : "=r"(val.d[0]), "=r"(val.d[1]), "=r"(val.d[2]), "=r"(val.d[3]),
        "=r"(val.d[4]), "=r"(val.d[5]), "=r"(val.d[6]), "=r"(val.d[7])
      : "r"((int)pred), "l"(ptr));
#else
  assert(false && "ld256_cg_or_zero requires SM100+ with CUDA 12.9+");
#endif
}

// 128-bit variant of the predicated load above. CUDA-only; on ROCm the
// assert fires and `val` is left untouched — callers must not rely on it
// there.
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
                                                 bool pred) {
#ifndef USE_ROCM
  uint32_t r0, r1, r2, r3;

  asm volatile(
      "{\n"
      " .reg .pred pr;\n"
      " setp.ne.u32 pr, %4, 0;\n"
      " mov.u32 %0, 0;\n"
      " mov.u32 %1, 0;\n"
      " mov.u32 %2, 0;\n"
      " mov.u32 %3, 0;\n"
      " @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
      "}\n"
      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
      : "r"((int)pred), "l"(ptr));

  val = uint4{r0, r1, r2, r3};
#else
  assert(false && "ld128_cg_or_zero is not supported on ROCm");
#endif
}
|
||||
|
||||
// ============================================================
|
||||
// Alignment helpers
|
||||
// ============================================================
|
||||
|
||||
// True when `ptr` lies on a 16-byte boundary (safe for 128-bit vector ops).
__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
  return reinterpret_cast<uintptr_t>(ptr) % 16 == 0;
}

// True when `ptr` lies on a 32-byte boundary (safe for 256-bit vector ops).
__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
  return reinterpret_cast<uintptr_t>(ptr) % 32 == 0;
}
|
||||
|
||||
// ============================================================
|
||||
// Packed type conversion and arithmetic
|
||||
// ============================================================
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __bfloat1622float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __half22float2(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
|
||||
return __float22bfloat162_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, __half2>) {
|
||||
return __float22half2_rn(val);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return float2(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename packed_t>
|
||||
__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
|
||||
const packed_t& y) {
|
||||
if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
|
||||
std::is_same_v<packed_t, __half2>) {
|
||||
return __hmul2(x, y);
|
||||
} else if constexpr (std::is_same_v<packed_t, float2>) {
|
||||
return make_float2(x.x * y.x, x.y * y.y);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@@ -15,9 +15,9 @@
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct SSMParamsBase {
|
||||
using index_t = uint32_t;
|
||||
using index_t = size_t;
|
||||
|
||||
int batch, dim, seqlen, dstate, n_groups, n_chunks;
|
||||
int batch, dim, seqlen, dstate, n_groups;
|
||||
int dim_ngroups_ratio;
|
||||
bool is_variable_B;
|
||||
bool is_variable_C;
|
||||
@@ -72,6 +72,8 @@ struct SSMParamsBase {
|
||||
void *__restrict__ block_idx_first_scheduled_token_ptr; // (batch,) - first block to write
|
||||
void *__restrict__ block_idx_last_scheduled_token_ptr; // (batch,) - last block to write
|
||||
void *__restrict__ initial_state_idx_ptr; // (batch,) - index of the initial state to use
|
||||
void *__restrict__ cu_chunk_seqlen_ptr; // (nchunks+1,) - cumulative chunk token offsets
|
||||
void *__restrict__ last_chunk_indices_ptr; // (batch,) - index of last chunk per sequence
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -81,7 +81,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
constexpr bool kIsVariableC = Ktraits::kIsVariableC;
|
||||
constexpr bool kHasZ = Ktraits::kHasZ;
|
||||
constexpr bool kVarlen = Ktraits::kVarlen;
|
||||
constexpr int kNThreads = Ktraits::kNThreads;
|
||||
constexpr int kNItems = Ktraits::kNItems;
|
||||
constexpr int kNRows = Ktraits::kNRows;
|
||||
constexpr bool kDirectIO = Ktraits::kDirectIO;
|
||||
@@ -161,17 +160,8 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// for (int state_idx = threadIdx.x; state_idx < params.dstate; state_idx += blockDim.x) {
|
||||
// smem_a[state_idx] = A[state_idx * params.A_dstate_stride];
|
||||
// smem_bc[state_idx] = B[state_idx * params.B_dstate_stride] * C[state_idx * params.C_dstate_stride];
|
||||
// }
|
||||
|
||||
constexpr int kChunkSize = kNThreads * kNItems;
|
||||
|
||||
// Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
|
||||
const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
|
||||
const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
|
||||
const int block_size = params.cache_enabled ? params.block_size : 2048;
|
||||
|
||||
const int* batch_cache_indices = cache_indices != nullptr ?
|
||||
cache_indices + batch_id * params.cache_indices_stride : nullptr;
|
||||
@@ -181,10 +171,44 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
reinterpret_cast<const int*>(params.block_idx_last_scheduled_token_ptr) : nullptr;
|
||||
const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ?
|
||||
reinterpret_cast<const int*>(params.initial_state_idx_ptr) : nullptr;
|
||||
const int* cu_chunk_seqlen = params.cu_chunk_seqlen_ptr != nullptr ?
|
||||
reinterpret_cast<const int*>(params.cu_chunk_seqlen_ptr) : nullptr;
|
||||
const int* last_chunk_indices = params.last_chunk_indices_ptr != nullptr ?
|
||||
reinterpret_cast<const int*>(params.last_chunk_indices_ptr) : nullptr;
|
||||
|
||||
const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
|
||||
|
||||
const int block_idx_first = (params.cache_enabled && block_idx_first_scheduled != nullptr) ?
|
||||
block_idx_first_scheduled[batch_id] : 0;
|
||||
|
||||
// Determine chunk boundaries from pre-computed metadata (APC mode)
|
||||
// or fall back to simple block_size chunking.
|
||||
int first_chunk_idx, n_chunks;
|
||||
int current_position;
|
||||
|
||||
if (cu_chunk_seqlen != nullptr && last_chunk_indices != nullptr) {
|
||||
const int last_chunk_idx = last_chunk_indices[batch_id];
|
||||
first_chunk_idx = (batch_id == 0) ? 0 : last_chunk_indices[batch_id - 1] + 1;
|
||||
n_chunks = last_chunk_idx - first_chunk_idx + 1;
|
||||
// Derive current_position: if the first chunk is partial (fills remainder
|
||||
// of a started block), offset into the block accordingly.
|
||||
const int first_chunk_tokens = cu_chunk_seqlen[first_chunk_idx + 1] - cu_chunk_seqlen[first_chunk_idx];
|
||||
const int chunk_start_offset = (n_chunks > 1 && first_chunk_tokens < block_size)
|
||||
? (block_size - first_chunk_tokens) : 0;
|
||||
current_position = block_idx_first * block_size + chunk_start_offset;
|
||||
} else {
|
||||
first_chunk_idx = 0;
|
||||
n_chunks = (seqlen + block_size - 1) / block_size;
|
||||
current_position = 0;
|
||||
}
|
||||
|
||||
int tokens_processed = 0;
|
||||
|
||||
for (int chunk = 0; chunk < n_chunks; ++chunk) {
|
||||
const int chunk_tokens = (cu_chunk_seqlen != nullptr)
|
||||
? cu_chunk_seqlen[first_chunk_idx + chunk + 1] - cu_chunk_seqlen[first_chunk_idx + chunk]
|
||||
: min(block_size, seqlen - tokens_processed);
|
||||
if (chunk_tokens <= 0) break;
|
||||
input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
|
||||
|
||||
__syncthreads();
|
||||
@@ -193,12 +217,12 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
if constexpr (!kDirectIO) {
|
||||
if (r > 0) { __syncthreads(); }
|
||||
}
|
||||
load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, seqlen - chunk * kChunkSize);
|
||||
load_input<Ktraits>(u + r * params.u_d_stride, u_vals[r], smem_load, chunk_tokens);
|
||||
if constexpr (!kDirectIO) { __syncthreads(); }
|
||||
load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, seqlen - chunk * kChunkSize);
|
||||
load_input<Ktraits>(delta + r * params.delta_d_stride, delta_vals_load[r], smem_load, chunk_tokens);
|
||||
}
|
||||
u += kChunkSize;
|
||||
delta += kChunkSize;
|
||||
u += chunk_tokens;
|
||||
delta += chunk_tokens;
|
||||
|
||||
float delta_vals[kNRows][kNItems], delta_u_vals[kNRows][kNItems], out_vals[kNRows][kNItems];
|
||||
#pragma unroll
|
||||
@@ -232,7 +256,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
weight_t B_vals[kNItems], C_vals[kNItems];
|
||||
if constexpr (kIsVariableB) {
|
||||
load_weight<Ktraits>(Bvar + state_idx * params.B_dstate_stride, B_vals,
|
||||
smem_load_weight, (seqlen - chunk * kChunkSize) * (1));
|
||||
smem_load_weight, chunk_tokens);
|
||||
if constexpr (!kIsVariableC) {
|
||||
#pragma unroll
|
||||
for (int r = 0; r < kNRows; ++r) {
|
||||
@@ -243,7 +267,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
if constexpr (kIsVariableC) {
|
||||
auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
|
||||
load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
|
||||
smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
|
||||
smem_load_weight_C, chunk_tokens);
|
||||
if constexpr (!kIsVariableB) {
|
||||
#pragma unroll
|
||||
for (int r = 0; r < kNRows; ++r) {
|
||||
@@ -266,10 +290,8 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
for (int i = 0; i < kNItems; ++i) {
|
||||
thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
|
||||
!kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
|
||||
if (seqlen % (kNItems * kNThreads) != 0) { // So that the last state is correct
|
||||
if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
|
||||
thread_data[i] = make_float2(1.f, 0.f);
|
||||
}
|
||||
if (threadIdx.x * kNItems + i >= chunk_tokens) {
|
||||
thread_data[i] = make_float2(1.f, 0.f);
|
||||
}
|
||||
}
|
||||
// Initialize running total
|
||||
@@ -301,14 +323,14 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
if (threadIdx.x == 0) {
|
||||
smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
|
||||
|
||||
// Store state at the end of each chunk when cache is enabled
|
||||
// Store state at the end of each aligned chunk when cache is enabled
|
||||
if (params.cache_enabled && batch_cache_indices != nullptr) {
|
||||
|
||||
size_t cache_slot;
|
||||
if (chunk == n_chunks - 1) {
|
||||
cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
|
||||
} else {
|
||||
cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
|
||||
const int block_idx_completed = (current_position + chunk_tokens - 1) / block_size;
|
||||
cache_slot = batch_cache_indices[block_idx_completed];
|
||||
}
|
||||
|
||||
size_t state_offset = cache_slot * params.ssm_states_batch_stride +
|
||||
@@ -331,38 +353,41 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
}
|
||||
}
|
||||
input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
|
||||
+ dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
|
||||
+ dim_id * kNRows * params.out_d_stride + tokens_processed;
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int r = 0; r < kNRows; ++r) {
|
||||
if constexpr (!kDirectIO) {
|
||||
if (r > 0) { __syncthreads(); }
|
||||
}
|
||||
store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
|
||||
store_output<Ktraits>(out + r * params.out_d_stride, out_vals[r], smem_store, chunk_tokens);
|
||||
}
|
||||
|
||||
if constexpr (kHasZ) {
|
||||
input_t *z = reinterpret_cast<input_t *>(params.z_ptr) + sequence_start_index * params.z_batch_stride
|
||||
+ dim_id * kNRows * params.z_d_stride + chunk * kChunkSize;
|
||||
+ dim_id * kNRows * params.z_d_stride + tokens_processed;
|
||||
input_t *out_z = reinterpret_cast<input_t *>(params.out_z_ptr) + sequence_start_index * params.out_z_batch_stride
|
||||
+ dim_id * kNRows * params.out_z_d_stride + chunk * kChunkSize;
|
||||
+ dim_id * kNRows * params.out_z_d_stride + tokens_processed;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < kNRows; ++r) {
|
||||
input_t z_vals[kNItems];
|
||||
__syncthreads();
|
||||
load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, seqlen - chunk * kChunkSize);
|
||||
load_input<Ktraits>(z + r * params.z_d_stride, z_vals, smem_load, chunk_tokens);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < kNItems; ++i) {
|
||||
float z_val = z_vals[i];
|
||||
out_vals[r][i] *= z_val / (1 + expf(-z_val));
|
||||
}
|
||||
__syncthreads();
|
||||
store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, seqlen - chunk * kChunkSize);
|
||||
store_output<Ktraits>(out_z + r * params.out_z_d_stride, out_vals[r], smem_store, chunk_tokens);
|
||||
}
|
||||
}
|
||||
|
||||
Bvar += kChunkSize * 1;
|
||||
Cvar += kChunkSize * 1;
|
||||
Bvar += chunk_tokens;
|
||||
Cvar += chunk_tokens;
|
||||
|
||||
tokens_processed += chunk_tokens;
|
||||
current_position += chunk_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -506,7 +531,9 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
|
||||
int64_t block_size,
|
||||
const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
|
||||
const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
|
||||
const std::optional<torch::Tensor> &initial_state_idx) {
|
||||
const std::optional<torch::Tensor> &initial_state_idx,
|
||||
const std::optional<torch::Tensor> &cu_chunk_seqlen,
|
||||
const std::optional<torch::Tensor> &last_chunk_indices) {
|
||||
|
||||
// Reset the parameters
|
||||
memset(¶ms, 0, sizeof(params));
|
||||
@@ -548,6 +575,8 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms,
|
||||
params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
|
||||
params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
|
||||
params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;
|
||||
params.cu_chunk_seqlen_ptr = cu_chunk_seqlen.has_value() ? cu_chunk_seqlen.value().data_ptr() : nullptr;
|
||||
params.last_chunk_indices_ptr = last_chunk_indices.has_value() ? last_chunk_indices.value().data_ptr() : nullptr;
|
||||
|
||||
// All stride are in elements, not bytes.
|
||||
params.A_d_stride = A.stride(0);
|
||||
@@ -633,7 +662,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
int64_t block_size,
|
||||
const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
|
||||
const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
|
||||
const std::optional<torch::Tensor> &initial_state_idx) {
|
||||
const std::optional<torch::Tensor> &initial_state_idx,
|
||||
const std::optional<torch::Tensor> &cu_chunk_seqlen,
|
||||
const std::optional<torch::Tensor> &last_chunk_indices) {
|
||||
auto input_type = u.scalar_type();
|
||||
auto weight_type = A.scalar_type();
|
||||
TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
|
||||
@@ -778,7 +809,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
block_size,
|
||||
block_idx_first_scheduled_token,
|
||||
block_idx_last_scheduled_token,
|
||||
initial_state_idx
|
||||
initial_state_idx,
|
||||
cu_chunk_seqlen,
|
||||
last_chunk_indices
|
||||
);
|
||||
|
||||
|
||||
|
||||
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
|
||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||
int32_t const SENTINEL =
|
||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||
// Intialize sorted_ids
|
||||
// Initialize sorted_ids
|
||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||
sorted_ids[i] = SENTINEL;
|
||||
}
|
||||
// Intialize expert_ids with -1
|
||||
// Initialize expert_ids with -1
|
||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||
block_ids[i] = -1;
|
||||
}
|
||||
|
||||
@@ -58,6 +58,10 @@ void shuffle_rows(const torch::Tensor& input_tensor,
|
||||
torch::Tensor& output_tensor);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// cuBLAS bf16 x bf16 -> fp32 router GEMM (fallback for non-SM90 / batch > 16)
|
||||
torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
|
||||
torch::Tensor const& weight);
|
||||
|
||||
// DeepSeek V3 optimized router GEMM kernel for SM90+
|
||||
// Computes output = mat_a @ mat_b.T where:
|
||||
// mat_a: [num_tokens, hidden_dim] in bf16
|
||||
|
||||
60
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
Normal file
60
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu
Normal file
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
// Adapted from SGLang:
|
||||
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled.cu
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include "cutlass_mxfp8_grouped_mm_launcher.cuh"
|
||||
|
||||
void cutlass_mxfp8_grouped_mm(const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& sfa,
|
||||
const torch::Tensor& sfb, torch::Tensor& d,
|
||||
const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& blockscale_offsets) {
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
|
||||
TORCH_CHECK(problem_sizes.size(1) == 3,
|
||||
"problem_sizes must have shape (num_experts, 3)");
|
||||
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
|
||||
"Number of experts in problem_sizes must match expert_offsets");
|
||||
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
|
||||
"problem_sizes must be int32");
|
||||
TORCH_CHECK(expert_offsets.dtype() == torch::kInt32,
|
||||
"expert_offsets must be int32");
|
||||
TORCH_CHECK(blockscale_offsets.dtype() == torch::kInt32,
|
||||
"blockscale_offsets must be int32");
|
||||
TORCH_CHECK(a.dim() == 2, "a must be a 2D tensor of shape (num_tokens, k)");
|
||||
TORCH_CHECK(b.dim() == 3,
|
||||
"b must be a 3D tensor of shape (num_experts, k, n)");
|
||||
TORCH_CHECK(a.size(1) == b.size(1) && a.size(1) % 128 == 0,
|
||||
"k should align 128");
|
||||
TORCH_CHECK(b.size(2) % 128 == 0, "n should align 128");
|
||||
TORCH_CHECK(a.strides()[1] == 1, "a must be row major");
|
||||
TORCH_CHECK(b.strides()[1] == 1, "b must be column major");
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
if (d.dtype() == torch::kBFloat16) {
|
||||
expert_specialization::cutlass_mxfp8_grouped_mm_dispatch_out_dtype<
|
||||
cutlass::bfloat16_t>(a, b, sfa, sfb, d, problem_sizes, expert_offsets,
|
||||
blockscale_offsets, stream);
|
||||
} else if (d.dtype() == torch::kFloat16) {
|
||||
expert_specialization::cutlass_mxfp8_grouped_mm_dispatch_out_dtype<
|
||||
cutlass::half_t>(a, b, sfa, sfb, d, problem_sizes, expert_offsets,
|
||||
blockscale_offsets, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false,
|
||||
"No implemented cutlass_mxfp8_grouped_mm for "
|
||||
"current device");
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "core/registration.h"
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("cutlass_mxfp8_grouped_mm", cutlass_mxfp8_grouped_mm);
|
||||
}
|
||||
141
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
Normal file
141
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_functor.cuh
Normal file
@@ -0,0 +1,141 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
// Adapted from SGLang:
|
||||
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_functor.cuh
|
||||
|
||||
#pragma once
|
||||
#include <cuda.h>
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass_mxfp8_grouped_mm_traits.cuh"
|
||||
|
||||
namespace expert_specialization {
|
||||
|
||||
using namespace cute;
|
||||
|
||||
template <typename GemmTraits>
|
||||
struct CutlassMxfp8GroupedMmOffsetFunctor {
|
||||
using Gemm = typename GemmTraits::Gemm;
|
||||
using ElementA = typename Gemm::ElementA;
|
||||
using ElementB = typename Gemm::ElementB;
|
||||
using ElementSF = typename GemmTraits::ElementSF;
|
||||
using ElementD = typename GemmTraits::ElementOutput;
|
||||
// Input
|
||||
int* expert_offsets{nullptr};
|
||||
int* blockscale_offsets{nullptr};
|
||||
// Output
|
||||
ElementA* a_base{nullptr};
|
||||
ElementB* b_base{nullptr};
|
||||
ElementSF* sfa_base{nullptr};
|
||||
ElementSF* sfb_base{nullptr};
|
||||
ElementD* d_base{nullptr};
|
||||
ElementA** a_offsets{nullptr};
|
||||
ElementB** b_offsets{nullptr};
|
||||
ElementSF** sfa_offsets{nullptr};
|
||||
ElementSF** sfb_offsets{nullptr};
|
||||
ElementD** d_offsets{nullptr};
|
||||
|
||||
CutlassMxfp8GroupedMmOffsetFunctor() = default;
|
||||
CutlassMxfp8GroupedMmOffsetFunctor(
|
||||
int* _expert_offsets, int* _blockscale_offsets, ElementA* _a_base,
|
||||
ElementB* _b_base, ElementSF* _sfa_base, ElementSF* _sfb_base,
|
||||
ElementD* _d_base, ElementA** _a_offsets, ElementB** _b_offsets,
|
||||
ElementSF** _sfa_offsets, ElementSF** _sfb_offsets, ElementD** _d_offsets)
|
||||
: expert_offsets{_expert_offsets},
|
||||
blockscale_offsets{_blockscale_offsets},
|
||||
a_base(_a_base),
|
||||
b_base(_b_base),
|
||||
sfa_base(_sfa_base),
|
||||
sfb_base(_sfb_base),
|
||||
d_base(_d_base),
|
||||
a_offsets(_a_offsets),
|
||||
b_offsets(_b_offsets),
|
||||
sfa_offsets(_sfa_offsets),
|
||||
sfb_offsets(_sfb_offsets),
|
||||
d_offsets(_d_offsets) {}
|
||||
|
||||
void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
|
||||
int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
|
||||
int64_t blockscale_offset =
|
||||
static_cast<int64_t>(blockscale_offsets[expert_id]);
|
||||
int64_t a_stride = expert_offset * k;
|
||||
int64_t b_stride = expert_id * k * n;
|
||||
int64_t d_stride = expert_offset * n;
|
||||
int64_t sfa_stride = blockscale_offset * (k / 32);
|
||||
int64_t sfb_stride = expert_id * n * (k / 32);
|
||||
|
||||
a_offsets[expert_id] = a_base + a_stride;
|
||||
b_offsets[expert_id] = b_base + b_stride;
|
||||
sfa_offsets[expert_id] = sfa_base + sfa_stride;
|
||||
sfb_offsets[expert_id] = sfb_base + sfb_stride;
|
||||
d_offsets[expert_id] = d_base + d_stride;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GemmTraits>
|
||||
struct CutlassMxfp8GroupedMmLayoutFunctor {
|
||||
using Sm1xxBlkScaledConfig = typename GemmTraits::Sm1xxBlkScaledConfig;
|
||||
using LayoutSFA = typename GemmTraits::LayoutSFA;
|
||||
using LayoutSFB = typename GemmTraits::LayoutSFB;
|
||||
LayoutSFA* layout_sfa_base{nullptr};
|
||||
LayoutSFB* layout_sfb_base{nullptr};
|
||||
|
||||
CutlassMxfp8GroupedMmLayoutFunctor() = default;
|
||||
CutlassMxfp8GroupedMmLayoutFunctor(LayoutSFA* _layout_sfa_base,
|
||||
LayoutSFB* _layout_sfb_base)
|
||||
: layout_sfa_base(_layout_sfa_base), layout_sfb_base(_layout_sfb_base) {}
|
||||
|
||||
void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
|
||||
LayoutSFA* layout_sfa_ptr = layout_sfa_base + expert_id;
|
||||
LayoutSFB* layout_sfb_ptr = layout_sfb_base + expert_id;
|
||||
*layout_sfa_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(
|
||||
cute::make_shape(m, n, k, 1));
|
||||
*layout_sfb_ptr = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(
|
||||
cute::make_shape(m, n, k, 1));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename GemmTraits>
|
||||
struct CutlassMxfp8GroupedMmStrideFunctor {
|
||||
using StrideA = typename GemmTraits::StrideA;
|
||||
using StrideB = typename GemmTraits::StrideB;
|
||||
using StrideD = typename GemmTraits::StrideD;
|
||||
StrideA* stride_A_base{nullptr};
|
||||
StrideB* stride_B_base{nullptr};
|
||||
StrideD* stride_D_base{nullptr};
|
||||
|
||||
CutlassMxfp8GroupedMmStrideFunctor() = default;
|
||||
CutlassMxfp8GroupedMmStrideFunctor(StrideA* _stride_A_base,
|
||||
StrideB* _stride_B_base,
|
||||
StrideD* _stride_D_base)
|
||||
: stride_A_base(_stride_A_base),
|
||||
stride_B_base(_stride_B_base),
|
||||
stride_D_base(_stride_D_base) {}
|
||||
|
||||
void CUTE_DEVICE operator()(int64_t expert_id, int m, int n, int k) {
|
||||
StrideA* stride_A = stride_A_base + expert_id;
|
||||
StrideB* stride_B = stride_B_base + expert_id;
|
||||
StrideD* stride_D = stride_D_base + expert_id;
|
||||
*stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
|
||||
*stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
|
||||
*stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
|
||||
}
|
||||
};
|
||||
|
||||
template <typename OffsetFunctor, typename LayoutFunctor,
|
||||
typename StrideFunctor>
|
||||
__global__ void cutlassMxfp8GroupedMmPreComputeKernel(
|
||||
int* problem_sizes, OffsetFunctor offset_functor,
|
||||
LayoutFunctor layout_functor, StrideFunctor stride_functor) {
|
||||
int64_t expert_id = static_cast<int64_t>(threadIdx.x);
|
||||
int m = problem_sizes[expert_id * 3 + 0];
|
||||
int n = problem_sizes[expert_id * 3 + 1];
|
||||
int k = problem_sizes[expert_id * 3 + 2];
|
||||
|
||||
offset_functor(expert_id, m, n, k);
|
||||
layout_functor(expert_id, m, n, k);
|
||||
stride_functor(expert_id, m, n, k);
|
||||
}
|
||||
|
||||
} // namespace expert_specialization
|
||||
179
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
Normal file
179
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_launcher.cuh
Normal file
@@ -0,0 +1,179 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
// Adapted from SGLang:
|
||||
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_launcher.cuh
|
||||
|
||||
#pragma once
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass_mxfp8_grouped_mm_functor.cuh"
|
||||
#include "cutlass_mxfp8_grouped_mm_traits.cuh"
|
||||
|
||||
namespace expert_specialization {
|
||||
|
||||
template <typename GemmTraits>
|
||||
void cutlass_mxfp8_grouped_mm_pre_compute(
|
||||
torch::Tensor& a_ptrs, torch::Tensor& b_ptrs, torch::Tensor& sfa_ptrs,
|
||||
torch::Tensor& sfb_ptrs, torch::Tensor& d_ptrs, torch::Tensor& stride_a,
|
||||
torch::Tensor& stride_b, torch::Tensor& stride_d, torch::Tensor& layout_sfa,
|
||||
torch::Tensor& layout_sfb, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& sfa, const torch::Tensor& sfb, const torch::Tensor& d,
|
||||
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& blockscale_offsets, cudaStream_t stream) {
|
||||
using OffsetFunctor = CutlassMxfp8GroupedMmOffsetFunctor<GemmTraits>;
|
||||
using ElementA = typename OffsetFunctor::ElementA;
|
||||
using ElementB = typename OffsetFunctor::ElementB;
|
||||
using ElementSF = typename OffsetFunctor::ElementSF;
|
||||
using ElementD = typename OffsetFunctor::ElementD;
|
||||
|
||||
using LayoutFunctor = CutlassMxfp8GroupedMmLayoutFunctor<GemmTraits>;
|
||||
using LayoutSFA = typename LayoutFunctor::LayoutSFA;
|
||||
using LayoutSFB = typename LayoutFunctor::LayoutSFB;
|
||||
|
||||
using StrideFunctor = CutlassMxfp8GroupedMmStrideFunctor<GemmTraits>;
|
||||
using StrideA = typename StrideFunctor::StrideA;
|
||||
using StrideB = typename StrideFunctor::StrideB;
|
||||
using StrideD = typename StrideFunctor::StrideD;
|
||||
|
||||
int num_experts = (int)expert_offsets.size(0);
|
||||
TORCH_CHECK(num_experts <= 1024,
|
||||
"Number of experts cannot exceed 1024, the maximum number of "
|
||||
"threads per block.");
|
||||
|
||||
OffsetFunctor offset_functor(
|
||||
reinterpret_cast<int*>(expert_offsets.data_ptr()),
|
||||
reinterpret_cast<int*>(blockscale_offsets.data_ptr()),
|
||||
reinterpret_cast<ElementA*>(a.data_ptr()),
|
||||
reinterpret_cast<ElementB*>(b.data_ptr()),
|
||||
reinterpret_cast<ElementSF*>(sfa.data_ptr()),
|
||||
reinterpret_cast<ElementSF*>(sfb.data_ptr()),
|
||||
reinterpret_cast<ElementD*>(d.data_ptr()),
|
||||
reinterpret_cast<ElementA**>(a_ptrs.data_ptr()),
|
||||
reinterpret_cast<ElementB**>(b_ptrs.data_ptr()),
|
||||
reinterpret_cast<ElementSF**>(sfa_ptrs.data_ptr()),
|
||||
reinterpret_cast<ElementSF**>(sfb_ptrs.data_ptr()),
|
||||
reinterpret_cast<ElementD**>(d_ptrs.data_ptr()));
|
||||
LayoutFunctor layout_functor(
|
||||
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
|
||||
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()));
|
||||
StrideFunctor stride_functor(reinterpret_cast<StrideA*>(stride_a.data_ptr()),
|
||||
reinterpret_cast<StrideB*>(stride_b.data_ptr()),
|
||||
reinterpret_cast<StrideD*>(stride_d.data_ptr()));
|
||||
cutlassMxfp8GroupedMmPreComputeKernel<<<1, num_experts, 0, stream>>>(
|
||||
static_cast<int*>(problem_sizes.data_ptr()), offset_functor,
|
||||
layout_functor, stride_functor);
|
||||
}
|
||||
|
||||
template <typename GemmTraits>
|
||||
void cutlass_mxfp8_grouped_mm(
|
||||
const torch::Tensor& a_ptrs, const torch::Tensor& b_ptrs,
|
||||
const torch::Tensor& sfa_ptrs, const torch::Tensor& sfb_ptrs,
|
||||
const torch::Tensor& d_ptrs, const torch::Tensor& stride_a,
|
||||
const torch::Tensor& stride_b, const torch::Tensor& stride_d,
|
||||
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
|
||||
const torch::Tensor& problem_sizes, cudaStream_t stream) {
|
||||
using Gemm = typename GemmTraits::Gemm;
|
||||
using ElementA = typename Gemm::ElementA;
|
||||
using ElementB = typename Gemm::ElementB;
|
||||
using ElementSF = typename GemmTraits::ElementSF;
|
||||
using ElementD = typename GemmTraits::ElementOutput;
|
||||
using StrideA = typename GemmTraits::StrideA;
|
||||
using StrideB = typename GemmTraits::StrideB;
|
||||
using StrideD = typename GemmTraits::StrideD;
|
||||
using LayoutSFA = typename GemmTraits::LayoutSFA;
|
||||
using LayoutSFB = typename GemmTraits::LayoutSFB;
|
||||
using UnderlyingProblemShape =
|
||||
typename GemmTraits::ProblemShape::UnderlyingProblemShape;
|
||||
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
hw_info.device_id = c10::cuda::current_device();
|
||||
hw_info.sm_count =
|
||||
at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
|
||||
hw_info.cluster_shape = GemmTraits::MMAConfig::preferred_cluster;
|
||||
hw_info.cluster_shape_fallback = GemmTraits::MMAConfig::fallback_cluster;
|
||||
|
||||
int num_experts = (int)problem_sizes.size(0);
|
||||
|
||||
UnderlyingProblemShape* underlying_problem_shape =
|
||||
reinterpret_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
|
||||
|
||||
typename Gemm::Arguments arguments = {
|
||||
cutlass::gemm::GemmUniversalMode::kGrouped,
|
||||
{num_experts, underlying_problem_shape, nullptr},
|
||||
{reinterpret_cast<const ElementA**>(a_ptrs.data_ptr()),
|
||||
reinterpret_cast<StrideA*>(stride_a.data_ptr()),
|
||||
reinterpret_cast<const ElementB**>(b_ptrs.data_ptr()),
|
||||
reinterpret_cast<StrideB*>(stride_b.data_ptr()),
|
||||
reinterpret_cast<const ElementSF**>(sfa_ptrs.data_ptr()),
|
||||
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
|
||||
reinterpret_cast<const ElementSF**>(sfb_ptrs.data_ptr()),
|
||||
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())},
|
||||
{{},
|
||||
nullptr,
|
||||
nullptr,
|
||||
reinterpret_cast<ElementD**>(d_ptrs.data_ptr()),
|
||||
reinterpret_cast<StrideD*>(stride_d.data_ptr())},
|
||||
hw_info,
|
||||
{} // Scheduler
|
||||
};
|
||||
|
||||
Gemm gemm;
|
||||
|
||||
auto can_implement_status = gemm.can_implement(arguments);
|
||||
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||
"Failed to implement GEMM");
|
||||
|
||||
torch::TensorOptions options_uint8 =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(d_ptrs.device());
|
||||
size_t workspace_size = gemm.get_workspace_size(arguments);
|
||||
torch::Tensor workspace = torch::empty(workspace_size, options_uint8);
|
||||
|
||||
auto status = gemm.initialize(arguments, workspace.data_ptr(), stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
|
||||
|
||||
status = gemm.run(stream, nullptr, true); // Enable PDL
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
void cutlass_mxfp8_grouped_mm_dispatch_out_dtype(
|
||||
const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& sfa,
|
||||
const torch::Tensor& sfb, torch::Tensor& d,
|
||||
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& blockscale_offsets, cudaStream_t stream) {
|
||||
int num_experts = (int)problem_sizes.size(0);
|
||||
torch::TensorOptions options_int64 =
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
|
||||
torch::TensorOptions options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(a.device());
|
||||
|
||||
torch::Tensor a_ptrs = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor b_ptrs = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor sfa_ptrs = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor sfb_ptrs = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor d_ptrs = torch::empty(num_experts, options_int64);
|
||||
|
||||
torch::Tensor stride_a = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor stride_b = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor stride_d = torch::empty(num_experts, options_int64);
|
||||
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int32);
|
||||
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int32);
|
||||
|
||||
using GemmTraits = CutlassMxfp8GroupedMmGemmTraits<MMA1SMConfig, OutType>;
|
||||
cutlass_mxfp8_grouped_mm_pre_compute<GemmTraits>(
|
||||
a_ptrs, b_ptrs, sfa_ptrs, sfb_ptrs, d_ptrs, stride_a, stride_b, stride_d,
|
||||
layout_sfa, layout_sfb, a, b, sfa, sfb, d, problem_sizes, expert_offsets,
|
||||
blockscale_offsets, stream);
|
||||
cutlass_mxfp8_grouped_mm<GemmTraits>(
|
||||
a_ptrs, b_ptrs, sfa_ptrs, sfb_ptrs, d_ptrs, stride_a, stride_b, stride_d,
|
||||
layout_sfa, layout_sfb, problem_sizes, stream);
|
||||
}
|
||||
|
||||
} // namespace expert_specialization
|
||||
127
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
Normal file
127
csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm_traits.cuh
Normal file
@@ -0,0 +1,127 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
// Adapted from SGLang:
|
||||
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_traits.cuh
|
||||
|
||||
#pragma once
|
||||
|
||||
// Misc
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/arch/arch.h"
|
||||
#include "cutlass/arch/mma.h"
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/detail/sm100_blockscaled_layout.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/group_array_problem_shape.hpp"
|
||||
#include "cutlass/layout/layout.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/numeric_size.h"
|
||||
|
||||
// Collective Builder
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
|
||||
// Integration
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
|
||||
namespace expert_specialization {
|
||||
|
||||
using namespace cute;
|
||||
|
||||
// Different configs for 1SM and 2SM MMA kernel
|
||||
struct MMA1SMConfig {
|
||||
using MmaTileShape = Shape<_128, _128, _128>;
|
||||
using KernelSchedule =
|
||||
cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100;
|
||||
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
|
||||
const static dim3 preferred_cluster;
|
||||
const static dim3 fallback_cluster;
|
||||
};
|
||||
const dim3 MMA1SMConfig::preferred_cluster(1, 4, 1);
|
||||
const dim3 MMA1SMConfig::fallback_cluster(1, 2, 1);
|
||||
|
||||
template <typename _MMAConfig, typename OutputDtype>
|
||||
struct CutlassMxfp8GroupedMmGemmTraits {
|
||||
using MMAConfig = _MMAConfig;
|
||||
using ElementInput = cutlass::float_e4m3_t;
|
||||
using ElementOutput = OutputDtype;
|
||||
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
|
||||
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::mx_float8_t<ElementInput>;
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
constexpr static int AlignmentA = 32;
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::mx_float8_t<ElementInput>;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
constexpr static int AlignmentB = 32;
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC = void;
|
||||
using ElementD = ElementOutput;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using LayoutD = cutlass::layout::RowMajor;
|
||||
constexpr static int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
constexpr static int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
using ElementAccumulator = float;
|
||||
|
||||
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
|
||||
using CustomEVTIdentity = // acc
|
||||
cutlass::epilogue::fusion::Sm90EVT<
|
||||
cutlass::epilogue::fusion::Sm90Compute<
|
||||
cutlass::epilogue::thread::Identity, ElementD, ElementAccumulator,
|
||||
RoundStyle>,
|
||||
cutlass::epilogue::fusion::Sm90AccFetch>;
|
||||
|
||||
// Core kernel configurations
|
||||
using ArchTag = cutlass::arch::Sm100;
|
||||
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
|
||||
using StageCountType = cutlass::gemm::collective::StageCountAuto;
|
||||
|
||||
// Runtime Cluster Shape
|
||||
using ClusterShape = Shape<int32_t, int32_t, _1>;
|
||||
|
||||
// Define Epilogue
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, typename MMAConfig::MmaTileShape,
|
||||
ClusterShape, Shape<_64, _64>, ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC*, AlignmentC, ElementD, LayoutD*, AlignmentD,
|
||||
typename MMAConfig::EpilogueSchedule,
|
||||
CustomEVTIdentity>::CollectiveOp;
|
||||
|
||||
// Define Mainloop
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
|
||||
LayoutB*, AlignmentB, ElementAccumulator,
|
||||
typename MMAConfig::MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
typename MMAConfig::KernelSchedule>::CollectiveOp;
|
||||
|
||||
// Define GemmKernel
|
||||
using GemmKernel =
|
||||
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
|
||||
CollectiveEpilogue>;
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
using ElementSF = typename Gemm::GemmKernel::ElementSF;
|
||||
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
|
||||
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
|
||||
using LayoutSFA =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
|
||||
using LayoutSFB =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
|
||||
using Sm1xxBlkScaledConfig =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
|
||||
};
|
||||
|
||||
} // namespace expert_specialization
|
||||
60
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
Normal file
60
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu
Normal file
@@ -0,0 +1,60 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
// Adapted from SGLang:
|
||||
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_group_quant.cu
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include "mxfp8_experts_quant.cuh"
|
||||
|
||||
// Quantizes a row-major 2D activation tensor to MXFP8 (e4m3 values plus
// e8m0 block scale factors) for a batch of expert groups. Only compiled-in
// on builds with SM100 (Blackwell) support.
//
// Args:
//   input:              (M, K) half/bfloat16 tensor, row major, K % 128 == 0.
//   problem_sizes:      (groups, 3) int32; per group [m, n, k] -- the kernel
//                       reads columns 0 (m) and 2 (k); column 1 is unused here.
//   expert_offsets:     (groups,) int32 row offsets of each group in `input`.
//   blockscale_offsets: (groups,) int32 row offsets into `scale_factor`.
//   quant_output:       output tensor receiving the e4m3 quantized values.
//   scale_factor:       output tensor receiving the e8m0 scale-factor bytes.
//
// Raises (via TORCH_CHECK) on shape/dtype violations, on unsupported input
// dtypes, or when compiled without SM100 support.
void mxfp8_experts_quant(const torch::Tensor& input,
                         const torch::Tensor& problem_sizes,
                         const torch::Tensor& expert_offsets,
                         const torch::Tensor& blockscale_offsets,
                         torch::Tensor& quant_output,
                         torch::Tensor& scale_factor) {
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
  TORCH_CHECK(input.dim() == 2, "input must be 2D tensor");
  TORCH_CHECK(input.size(1) % 128 == 0, "k must align to 128");
  TORCH_CHECK(input.strides()[1] == 1, "input must be row major");
  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
              "problem_sizes must be int32");
  TORCH_CHECK(expert_offsets.dtype() == torch::kInt32,
              "expert_offsets must be int32");
  TORCH_CHECK(blockscale_offsets.dtype() == torch::kInt32,
              "blockscale_offsets must be int32");

  auto groups = problem_sizes.size(0);
  TORCH_CHECK(
      expert_offsets.dim() == 1 && expert_offsets.size(0) == groups,
      "expert_offsets must be 1D and have size equal to the number of groups");
  TORCH_CHECK(
      blockscale_offsets.dim() == 1 && blockscale_offsets.size(0) == groups,
      "blockscale_offsets must be 1D and have size equal to the number of "
      "groups");

  // Dispatch on the input element type. Note: the launcher queries the
  // current CUDA stream itself, so no stream handling is needed here
  // (a previously fetched-but-unused stream local was removed).
  if (input.dtype() == torch::kBFloat16) {
    expert_specialization::launch_mxfp8_experts_quant<__nv_bfloat16>(
        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output,
        scale_factor);
  } else if (input.dtype() == torch::kFloat16) {
    expert_specialization::launch_mxfp8_experts_quant<__half>(
        input, problem_sizes, expert_offsets, blockscale_offsets, quant_output,
        scale_factor);
  } else {
    TORCH_CHECK(false, "dtype must be kFloat16 or kBFloat16");
  }
#else
  TORCH_CHECK(false,
              "No implemented mxfp8_experts_quant for "
              "current device");
#endif
}
|
||||
|
||||
#include "core/registration.h"

// Register mxfp8_experts_quant as the CUDA backend implementation of the
// op with the same name in this extension's torch library.
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("mxfp8_experts_quant", mxfp8_experts_quant);
}
|
||||
414
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
Normal file
414
csrc/moe/mxfp8_moe/mxfp8_experts_quant.cuh
Normal file
@@ -0,0 +1,414 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
// Adapted from SGLang:
// https://github.com/sgl-project/sglang/blob/ded068a76e00878881d52d5bfb791e0f60d7311b/sgl-kernel/csrc/expert_specialization/es_sm100_mxfp8_blockscaled_group_quant.cuh

#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/all.h>

#include <cuda/ptx>

#include "cute/tensor.hpp"

namespace expert_specialization {

using namespace cute;

// Threads per CTA for the quantization kernel launch.
constexpr uint32_t THREAD_BLOCK_SIZE = 128;
// CUDA warp size (not referenced in the code shown here).
constexpr uint32_t WARP_SIZE = 32;
// Each CTA processes one (BLOCK_M, BLOCK_K) tile of the input at a time.
constexpr int BLOCK_M = 128;
constexpr int BLOCK_K = 128;
// Global<->register tiled-copy layouts: 16x8 = 128 threads, each moving 16
// contiguous elements, giving a (16, 128) copy tile per iteration.
using ThrLayout = Layout<Shape<_16, _8>, Stride<_8, _1>>;
using ValLayout = Layout<Shape<_1, _16>>;
// Register->shared tiled-copy layouts for scale-factor bytes: 16x4 threads,
// one byte each, giving a (16, 4) copy tile.
using SfR2SThrLayout = Layout<Shape<_16, _4>, Stride<_4, _1>>;
using SfR2SValLayout = Layout<Shape<_1, _1>>;
// Interleaved in-memory layout of one (128 rows x 4 cols) scale-factor
// tile: ((_32,_4), _4) : ((_16,_4), _1). Presumably this matches the
// blockscale swizzle expected by the consuming grouped GEMM -- TODO confirm
// against the GEMM's scale-factor layout.
using ScaleFactorTileLayout =
    Layout<Shape<Shape<_32, _4>, _4>, Stride<Stride<_16, _4>, _1>>;
|
||||
|
||||
// Fast reciprocal.
// Approximate 1/a via the PTX `rcp.approx.ftz.f32` instruction
// (flush-to-zero, approximate precision -- not a correctly-rounded divide).
inline __device__ float reciprocal_approximate_ftz(float a) {
  float b;
  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
  return b;
}
|
||||
|
||||
// Some code references TRT-LLM:
// https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/quantization.cuh
//
// Quantizes the 16 half/bfloat16 elements of `fragment_s` into 16 e4m3
// values written to `fragment_d`, and returns the e8m0 scale-factor byte
// for the scale block. Adjacent lanes (xor 1) exchange their local maxima,
// so a scale block spans the 2 lanes' combined 32 elements and both lanes
// of a pair return the same scale-factor byte.
template <typename FragmentS, typename FragmentD>
__inline__ __device__ uint8_t cvt_warp_fp16_to_mxfp8(FragmentS& fragment_s,
                                                     FragmentD& fragment_d) {
  using FragmentSLayout = typename FragmentS::layout_type;
  using FragmentDLayout = typename FragmentD::layout_type;
  FragmentSLayout fragment_s_layout;
  FragmentDLayout fragment_d_layout;
  // Both fragments must hold exactly 16 elements at compile time.
  static_assert(is_static<FragmentSLayout>::value &&
                size(fragment_s_layout) == 16);
  static_assert(is_static<FragmentDLayout>::value &&
                size(fragment_d_layout) == 16);

  constexpr int eles_per_thr = 16;
  using ValType = typename FragmentS::element_type;
  // Pack elements two-at-a-time so the abs/max reduction below can use the
  // packed __habs2/__hmax2 intrinsics.
  using VecType = std::conditional_t<std::is_same_v<ValType, __nv_bfloat16>,
                                     __nv_bfloat162, __half2>;
  VecType vec[8];
  // Assign vals
  vec[0].x = fragment_s(Int<0>{});
  vec[0].y = fragment_s(Int<1>{});
  vec[1].x = fragment_s(Int<2>{});
  vec[1].y = fragment_s(Int<3>{});
  vec[2].x = fragment_s(Int<4>{});
  vec[2].y = fragment_s(Int<5>{});
  vec[3].x = fragment_s(Int<6>{});
  vec[3].y = fragment_s(Int<7>{});
  vec[4].x = fragment_s(Int<8>{});
  vec[4].y = fragment_s(Int<9>{});
  vec[5].x = fragment_s(Int<10>{});
  vec[5].y = fragment_s(Int<11>{});
  vec[6].x = fragment_s(Int<12>{});
  vec[6].y = fragment_s(Int<13>{});
  vec[7].x = fragment_s(Int<14>{});
  vec[7].y = fragment_s(Int<15>{});

  // Packed absolute maximum over this thread's 16 elements...
  auto local_max = __habs2(vec[0]);
  for (int i = 1; i < eles_per_thr / 2; i++) {
    local_max = __hmax2(__habs2(vec[i]), local_max);
  }
  // ...then merge with the xor-1 partner lane so both lanes of the pair
  // agree on the maximum of their combined 32 elements.
  local_max = __hmax2(__shfl_xor_sync(uint32_t(-1), local_max, 1), local_max);

  // Get the final absolute maximum values.
  float block_max(0.0f);
  if constexpr (std::is_same_v<ValType, __nv_bfloat16>) {
    block_max = __bfloat162float(__hmax(local_max.x, local_max.y));
  } else {
    block_max = __half2float(__hmax(local_max.x, local_max.y));
  }
  // Get the SF (max value of the vector / max value of mxfp8).
  // 448.0f is the largest finite e4m3 value.
  float sf_val = block_max * reciprocal_approximate_ftz(448.0f);
  // 8 bits representation of the SF.
  uint8_t fp8_sf_val;

  // Round the scale UP (cudaRoundPosInf) when converting to e8m0 so that
  // values divided by the realized scale never exceed the e4m3 range.
  __nv_fp8_e8m0 tmp_sf_val;
  tmp_sf_val.__x =
      __nv_cvt_float_to_e8m0(sf_val, __NV_SATFINITE, cudaRoundPosInf);
  sf_val = static_cast<float>(tmp_sf_val);
  fp8_sf_val = tmp_sf_val.__x;
  // Get the output scale (reciprocal of the SFValue).
  // An all-zero block gets output_scale = 0, producing zero outputs.
  float output_scale =
      block_max != 0.f ? reciprocal_approximate_ftz(sf_val) : 0.0f;

  // Convert the input to float.
  float2 fp2_vals[eles_per_thr / 2];

#pragma unroll
  for (int i = 0; i < eles_per_thr / 2; i++) {
    if constexpr (std::is_same_v<ValType, __half>) {
      fp2_vals[i] = __half22float2(vec[i]);
    } else {
      fp2_vals[i] = __bfloat1622float2(vec[i]);
    }
    fp2_vals[i].x *= output_scale;
    fp2_vals[i].y *= output_scale;
  }
  // Convert scaled floats to packed e4m3, then view the result as raw bytes
  // for the per-element stores into the cutlass fragment below.
  union {
    uint8_t bytes[16];
    __nv_fp8x2_e4m3 elts[8];
  } u;
  u.elts[0] = __nv_fp8x2_e4m3(fp2_vals[0]);
  u.elts[1] = __nv_fp8x2_e4m3(fp2_vals[1]);
  u.elts[2] = __nv_fp8x2_e4m3(fp2_vals[2]);
  u.elts[3] = __nv_fp8x2_e4m3(fp2_vals[3]);
  u.elts[4] = __nv_fp8x2_e4m3(fp2_vals[4]);
  u.elts[5] = __nv_fp8x2_e4m3(fp2_vals[5]);
  u.elts[6] = __nv_fp8x2_e4m3(fp2_vals[6]);
  u.elts[7] = __nv_fp8x2_e4m3(fp2_vals[7]);
  fragment_d(Int<0>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[0]);
  fragment_d(Int<1>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[1]);
  fragment_d(Int<2>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[2]);
  fragment_d(Int<3>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[3]);
  fragment_d(Int<4>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[4]);
  fragment_d(Int<5>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[5]);
  fragment_d(Int<6>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[6]);
  fragment_d(Int<7>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[7]);
  fragment_d(Int<8>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[8]);
  fragment_d(Int<9>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[9]);
  fragment_d(Int<10>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[10]);
  fragment_d(Int<11>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[11]);
  fragment_d(Int<12>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[12]);
  fragment_d(Int<13>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[13]);
  fragment_d(Int<14>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[14]);
  fragment_d(Int<15>{}) = cutlass::float_e4m3_t::bitcast(u.bytes[15]);
  return fp8_sf_val;
}
|
||||
|
||||
// Quantizes one (128, 128) tile. Per 16-row sub-tile: copy input from
// global memory to registers, convert to e4m3 + e8m0 scale factors, write
// the quantized values back to global memory, and stage the scale-factor
// bytes in shared memory. After the row loop, thread 0 bulk-copies the
// 512-byte shared scale-factor tile to global memory via cp.async.bulk.
//
//   tensor_s:         (128, 128) row-major source tile (global).
//   tensor_p:         (128, 128) predicate tile masking out-of-bounds rows.
//   tensor_d:         (128, 128) row-major e4m3 destination tile (global).
//   tensor_shared_sf: (128, 4) scale-factor staging tile (shared).
//   tensor_sf:        (128, 4) scale-factor destination tile (global).
//   m:                number of valid rows in this expert group.
template <typename TensorS, typename TensorP, typename TensorD,
          typename TensorSharedSF, typename TensorSF, typename TiledCopyG2R,
          typename TiledCopyR2G, typename TiledCopyR2S>
__inline__ __device__ void mxfp8_experts_quant_tile(
    TensorS& tensor_s, TensorP& tensor_p, TensorD& tensor_d,
    TensorSharedSF& tensor_shared_sf, TensorSF& tensor_sf, int m,
    TiledCopyG2R& tiled_copy_g2r, TiledCopyR2G& tiled_copy_r2g,
    TiledCopyR2S& tiled_copy_r2s) {
  // Compile-time shape checks on all tile arguments.
  static_assert(size(get<0>(typename TensorS::layout_type{})) == 128 &&
                size(get<1>(typename TensorS::layout_type{})) == 128 &&
                stride(get<1>(typename TensorS::layout_type{})) == 1);
  static_assert(size(get<0>(typename TensorD::layout_type{})) == 128 &&
                size(get<1>(typename TensorD::layout_type{})) == 128 &&
                stride(get<1>(typename TensorD::layout_type{})) == 1);
  static_assert(size(get<0>(typename TensorP::layout_type{})) == 128 &&
                size(get<1>(typename TensorP::layout_type{})) == 128);
  static_assert(size(get<0>(typename TensorSharedSF::layout_type{})) == 128 &&
                size(get<1>(typename TensorSharedSF::layout_type{})) == 4);
  static_assert(size(get<0>(typename TensorSF::layout_type{})) == 128 &&
                size(get<1>(typename TensorSF::layout_type{})) == 4);

  // Split the (128, 128) tile into (16, 128) sub-tiles -- one per loop
  // iteration of the copy/convert pipeline below.
  using Tiler_MN = typename TiledCopyG2R::Tiler_MN;
  auto tiler_mn = Tiler_MN{};
  static_assert(size<0>(tiler_mn) == 16 && size<1>(tiler_mn) == 128);

  auto tiled_tensor_s = tiled_divide(tensor_s, tiler_mn);
  auto tiled_tensor_p = tiled_divide(tensor_p, tiler_mn);
  auto tiled_tensor_d = tiled_divide(tensor_d, tiler_mn);
  static_assert(size<2>(tiled_tensor_s) == 1);
  static_assert(size<2>(tiled_tensor_p) == 1);
  static_assert(size<2>(tiled_tensor_d) == 1);
  auto squeeze_tiled_tensor_s = take<0, 2>(tiled_tensor_s);
  auto squeeze_tiled_tensor_p = take<0, 2>(tiled_tensor_p);
  auto squeeze_tiled_tensor_d = take<0, 2>(tiled_tensor_d);

  // Same sub-tiling for the (128, 4) scale-factor tiles, in (16, 4) pieces.
  using SF_Tiler_MN = typename TiledCopyR2S::Tiler_MN;
  auto sf_tiler_mn = SF_Tiler_MN{};
  static_assert(size<0>(sf_tiler_mn) == 16 && size<1>(sf_tiler_mn) == 4);

  auto tiled_tensor_sf = tiled_divide(tensor_sf, sf_tiler_mn);
  auto tiled_tensor_shared_sf = tiled_divide(tensor_shared_sf, sf_tiler_mn);
  auto squeeze_tiled_tensor_sf = take<0, 2>(tiled_tensor_sf);
  auto squeeze_tiled_tensor_shared_sf = take<0, 2>(tiled_tensor_shared_sf);

  constexpr int tile_loop_count = size<1>(tiled_tensor_s);
  constexpr int rows_in_tile = 16;
  // We don't need to clear shared memory
  // clear(squeeze_tiled_tensor_shared_sf);
#pragma unroll 4
  for (int t = 0; t < tile_loop_count; t++) {
    // Skip sub-tiles entirely past the valid rows of this group.
    if (t * rows_in_tile >= m) {
      break;
    }
    auto current_copy_tile_s = tensor<0>(squeeze_tiled_tensor_s(_, t));
    auto current_copy_tile_p = tensor<0>(squeeze_tiled_tensor_p(_, t));
    auto current_copy_tile_d = tensor<0>(squeeze_tiled_tensor_d(_, t));
    // NOTE(review): current_copy_tile_sf is never read in this loop -- the
    // global scale-factor write happens once after the loop via the bulk
    // copy below; this local looks removable, confirm before deleting.
    auto current_copy_tile_sf = tensor<0>(squeeze_tiled_tensor_sf(_, t));
    auto current_copy_tile_shared_sf =
        tensor<0>(squeeze_tiled_tensor_shared_sf(_, t));

    // Global to Register copy
    auto thr_copy_g2r = tiled_copy_g2r.get_thread_slice(threadIdx.x);
    auto thr_tile_g2r_s = thr_copy_g2r.partition_S(current_copy_tile_s);
    auto thr_tile_g2r_p = thr_copy_g2r.partition_S(current_copy_tile_p);
    auto input_fragment = make_fragment_like(thr_tile_g2r_s);

    // Register to Global copy
    auto thr_copy_r2g = tiled_copy_r2g.get_thread_slice(threadIdx.x)
;
    auto thr_tile_r2g_d = thr_copy_r2g.partition_D(current_copy_tile_d);
    auto thr_tile_r2g_p = thr_copy_r2g.partition_D(current_copy_tile_p);
    auto output_fragment = make_fragment_like(thr_tile_r2g_d);

    // Register to Shared copy. Lanes are sliced by threadIdx.x / 2 because
    // a lane pair (xor 1) shares one scale-factor byte; only even lanes
    // perform the store (see the threadIdx.x % 2 == 0 guard below).
    auto thr_copy_r2s = tiled_copy_r2s.get_thread_slice(threadIdx.x / 2);
    auto thr_tile_r2s_shared_sf =
        thr_copy_r2s.partition_D(current_copy_tile_shared_sf);
    auto shared_sf_fragment = make_fragment_like(thr_tile_r2s_shared_sf);

    // CopyG2R & convert & CopyR2G
    copy_if(tiled_copy_g2r, thr_tile_g2r_p, thr_tile_g2r_s, input_fragment);
    uint8_t fp8_sf_val =
        cvt_warp_fp16_to_mxfp8(input_fragment, output_fragment);
    copy_if(tiled_copy_r2g, thr_tile_r2g_p, output_fragment, thr_tile_r2g_d);
    shared_sf_fragment[0] = fp8_sf_val;

    // Before first copy r2s, clear shared memory and wait previous group
    if (t == 0 && threadIdx.x == 0) {
      // Wait for the group to have completed reading from shared memory.
      cuda::ptx::cp_async_bulk_wait_group_read(cuda::ptx::n32_t<0>());
    }
    __syncthreads();

    if (threadIdx.x % 2 == 0) {
      copy(tiled_copy_r2s, shared_sf_fragment, thr_tile_r2s_shared_sf);
    }
    __syncthreads();
  }

  // Wait for shared memory writes to be visible to TMA engine.
  cuda::ptx::fence_proxy_async(cuda::ptx::space_shared);  // b)
  __syncthreads();

  // Single-thread bulk copy of the full 512-byte (128 x 4) scale-factor
  // tile from shared to global memory.
  if (threadIdx.x == 0) {
    cuda::ptx::cp_async_bulk(cuda::ptx::space_global, cuda::ptx::space_shared,
                             squeeze_tiled_tensor_sf.data().get(),
                             squeeze_tiled_tensor_shared_sf.data().get(), 512);
    // Wait for TMA transfer to have finished reading shared memory.
    // Create a "bulk async-group" out of the previous bulk copy operation.
    cuda::ptx::cp_async_bulk_commit_group();
  }
  __syncthreads();
}
|
||||
|
||||
// Kernel entry point. Iterates over all expert groups; within each group,
// blocks stride over the group's (128, 128) tiles (blk_offset starts at
// blockIdx.x and advances by gridDim.x) and quantize them via
// mxfp8_experts_quant_tile. Only compiled for __CUDA_ARCH__ >= 1000
// (SM100); the body is empty otherwise.
template <typename T_IN, typename TiledCopyG2R, typename TiledCopyR2G,
          typename TiledCopyR2S>
__global__ void mxfp8_experts_quant_kernel(
    const T_IN* input, const int* problem_sizes, const int* expert_offsets,
    const int* blockscale_offsets, cutlass::float_e4m3_t* quant_output,
    uint8_t* scale_factor, int groups, TiledCopyG2R tiled_copy_g2r,
    TiledCopyR2G tiled_copy_r2g, TiledCopyR2S tiled_copy_r2s) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
  // 512-byte staging buffer for one (128 x 4) scale-factor tile; 512-byte
  // alignment for the cp.async.bulk transfer issued by the tile routine.
  __shared__ __align__(512) uint8_t shared_memory[512];
  ScaleFactorTileLayout scale_factor_tile_layout{};
  auto scale_factor_shared =
      make_tensor(make_smem_ptr(shared_memory),
                  scale_factor_tile_layout);  // ((_32,_4), _4):((_16,_4), _1)
  // TODO: Transform Groupwise Schedule into a more efficient Schedule
  for (int g = 0; g < groups; g++) {
    // problem_sizes rows are [m, ?, k]; only columns 0 and 2 are read here.
    int m = problem_sizes[g * 3 + 0];
    int k = problem_sizes[g * 3 + 2];
    // Widen to int64 before the element-offset multiplications below to
    // avoid 32-bit overflow on large tensors.
    int64_t expert_offset = static_cast<int64_t>(expert_offsets[g]);
    int64_t blockscale_offset = static_cast<int64_t>(blockscale_offsets[g]);

    auto input_tensor = make_tensor(
        make_gmem_ptr(input + expert_offset * k),
        make_layout(make_shape(m, k),
                    LayoutRight{}));  // (M, K):(K, 1) half_t/bfloat16_t

    auto quant_output_tensor = make_tensor(
        make_gmem_ptr(quant_output + expert_offset * k),
        make_layout(make_shape(m, k),
                    LayoutRight{}));  // (M, K):(K, 1) cutlass::float_e4m3_t

    // Scale factors cover one byte per 32 input elements, with the row
    // count padded up to a multiple of 128 (one full tile).
    auto scale_factor_shape = make_shape(ceil_div(m, 128) * 128, k / 32);
    auto scale_factor_layout = tile_to_shape(scale_factor_tile_layout,
                                             scale_factor_shape, LayoutRight{});
    // layout<0>(layout<0>(scale_factor_layout)) (_32,_4):(_16,_4) -- static
    // layout<1>(layout<0>(scale_factor_layout)) M_align_128 / 128 -- dynamic
    // shape dynamic stride layout<0>(layout<1>(scale_factor_layout)) _4:_1 --
    // static layout<1>(layout<1>(scale_factor_layout)) (K / 32) / 4 : _512 --
    // dynamic shape static stride

    // Reshape to zipped layout for 1D indexing
    auto zipped_scale_factor_layout = make_layout(
        make_layout(layout<0>(layout<0>(scale_factor_layout)),
                    layout<0>(layout<1>(scale_factor_layout))),
        make_layout(
            layout<1>(layout<0>(scale_factor_layout)),
            layout<1>(layout<1>(
                scale_factor_layout))));  // (((_32,_4),_4),(M_align_128 /
                                          // 128,(K / 32) /
                                          // 4)):(((_16,_4),_1),(?,_512))

    auto scale_factor_tensor =
        make_tensor(make_gmem_ptr(scale_factor + blockscale_offset * (k / 32)),
                    zipped_scale_factor_layout);

    // Used for cases where M is not divisible by 128 (most scenarios).
    // predict_tensor is a lazily-evaluated bounds mask over (M, K).
    auto input_shape = shape(input_tensor);  // (M, K):(K, 1)
    auto identity_tensor = make_identity_tensor(input_shape);
    auto predict_tensor = cute::lazy::transform(
        identity_tensor, [&](auto c) { return elem_less(c, input_shape); });

    // (_128, _128)
    auto tiler = make_shape(Int<BLOCK_M>{}, Int<BLOCK_K>{});

    auto tiled_input_tensor = zipped_divide(
        input_tensor, tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))
    auto tiled_quant_output_tensor =
        zipped_divide(quant_output_tensor,
                      tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))
    auto tiled_predict_tensor = zipped_divide(
        predict_tensor, tiler);  // ((128, 128), (cdiv(M, 128), cdiv(K, 128)))

    auto total_tiles =
        size<1>(tiled_input_tensor);  // cdiv(M, 128) * cdiv(K, 128)
    // Blocks stride over the group's tiles: start at blockIdx.x, advance
    // by gridDim.x until every tile is covered.
    decltype(total_tiles) blk_offset = blockIdx.x;
    while (blk_offset < total_tiles) {
      auto current_input_tile = tensor<0>(tiled_input_tensor(_, blk_offset));
      auto current_quant_output_tile =
          tensor<0>(tiled_quant_output_tensor(_, blk_offset));
      auto current_predict_tile =
          tensor<0>(tiled_predict_tensor(_, blk_offset));
      auto current_scale_factor_tile =
          tensor<0>(scale_factor_tensor(_, blk_offset));

      mxfp8_experts_quant_tile<
          decltype(current_input_tile), decltype(current_predict_tile),
          decltype(current_quant_output_tile), decltype(scale_factor_shared),
          decltype(current_scale_factor_tile), TiledCopyG2R, TiledCopyR2G,
          TiledCopyR2S>(current_input_tile, current_predict_tile,
                        current_quant_output_tile, scale_factor_shared,
                        current_scale_factor_tile, m, tiled_copy_g2r,
                        tiled_copy_r2g, tiled_copy_r2s);
      blk_offset += gridDim.x;
    }
  }
#endif
}
|
||||
|
||||
template <typename T_IN>
|
||||
void launch_mxfp8_experts_quant(const torch::Tensor& input,
|
||||
const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets,
|
||||
const torch::Tensor& blockscale_offsets,
|
||||
torch::Tensor& quant_output,
|
||||
torch::Tensor& scale_factor) {
|
||||
ThrLayout thr_layout{};
|
||||
ValLayout val_layout{};
|
||||
SfR2SThrLayout r2s_thr_layout{};
|
||||
SfR2SValLayout r2s_val_layout{};
|
||||
|
||||
using CopyOpG2R =
|
||||
UniversalCopy<cutlass::AlignedArray<T_IN, size(val_layout)>>;
|
||||
using CopyAtomG2R = cute::Copy_Atom<CopyOpG2R, T_IN>;
|
||||
auto tiled_copy_g2r = cute::make_tiled_copy(
|
||||
CopyAtomG2R{}, thr_layout, val_layout); // Tiler_MN: (16, 128)
|
||||
|
||||
using CopyOpR2G = UniversalCopy<
|
||||
cutlass::AlignedArray<cutlass::float_e4m3_t, size(val_layout)>>;
|
||||
using CopyAtomR2G = cute::Copy_Atom<CopyOpR2G, cutlass::float_e4m3_t>;
|
||||
auto tiled_copy_r2g = cute::make_tiled_copy(
|
||||
CopyAtomR2G{}, thr_layout, val_layout); // Tiler_MN: (16, 128)
|
||||
|
||||
using CopyOpR2S =
|
||||
UniversalCopy<cutlass::AlignedArray<uint8_t, size(r2s_val_layout)>>;
|
||||
using CopyAtomR2S = cute::Copy_Atom<CopyOpR2S, uint8_t>;
|
||||
auto tiled_copy_r2s = cute::make_tiled_copy(
|
||||
CopyAtomR2S{}, r2s_thr_layout, r2s_val_layout); // Tiler_MN: (16, 4)
|
||||
|
||||
int max_active_blocks_per_sm = -1;
|
||||
AT_CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
mxfp8_experts_quant_kernel<T_IN, decltype(tiled_copy_g2r),
|
||||
decltype(tiled_copy_r2g),
|
||||
decltype(tiled_copy_r2s)>,
|
||||
THREAD_BLOCK_SIZE, 0));
|
||||
|
||||
dim3 grid(at::cuda::getCurrentDeviceProperties()->multiProcessorCount *
|
||||
max_active_blocks_per_sm,
|
||||
1, 1);
|
||||
dim3 block(THREAD_BLOCK_SIZE, 1, 1);
|
||||
int num_experts = (int)problem_sizes.size(0);
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
mxfp8_experts_quant_kernel<T_IN, decltype(tiled_copy_g2r),
|
||||
decltype(tiled_copy_r2g), decltype(tiled_copy_r2s)>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
reinterpret_cast<const T_IN*>(input.data_ptr()),
|
||||
reinterpret_cast<const int*>(problem_sizes.data_ptr()),
|
||||
reinterpret_cast<const int*>(expert_offsets.data_ptr()),
|
||||
reinterpret_cast<const int*>(blockscale_offsets.data_ptr()),
|
||||
reinterpret_cast<cutlass::float_e4m3_t*>(quant_output.data_ptr()),
|
||||
reinterpret_cast<uint8_t*>(scale_factor.data_ptr()), num_experts,
|
||||
tiled_copy_g2r, tiled_copy_r2g, tiled_copy_r2s);
|
||||
}
|
||||
|
||||
} // namespace expert_specialization
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user