Compare commits
445 Commits
v0.17.1rc0
...
v0.18.1rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
298e510848 | ||
|
|
3982bc2cd0 | ||
|
|
02eec7ecbe | ||
|
|
17ee641c45 | ||
|
|
0d50fa1db6 | ||
|
|
1fa1e53a73 | ||
|
|
3ffa52009f | ||
|
|
87bd91892f | ||
|
|
c7f98b4d0a | ||
|
|
1c472f8fe1 | ||
|
|
c57d38d603 | ||
|
|
e5ed6c6c13 | ||
|
|
b3d0b37908 | ||
|
|
85f671b8e1 | ||
|
|
8bc6b5cdb0 | ||
|
|
4f16ebbbd3 | ||
|
|
12fd17eb51 | ||
|
|
37aadf6237 | ||
|
|
d7d2b5e405 | ||
|
|
6ec5e9fd37 | ||
|
|
e1d85e5c24 | ||
|
|
79eb9369c5 | ||
|
|
e80cfe575d | ||
|
|
d0532bf38d | ||
|
|
fb4e8bf442 | ||
|
|
6ade4bc5a5 | ||
|
|
2e089b96a8 | ||
|
|
880be2b1b8 | ||
|
|
c0f5fae601 | ||
|
|
aa84e43ccb | ||
|
|
5e806bcf54 | ||
|
|
56a62c310c | ||
|
|
1779c09898 | ||
|
|
44eea10f68 | ||
|
|
8b6c6b9505 | ||
|
|
9f6d9dd371 | ||
|
|
dd20ee4e3e | ||
|
|
0523449c9c | ||
|
|
b4c1aef21c | ||
|
|
6050b93bed | ||
|
|
5a4a179591 | ||
|
|
37cd9fc107 | ||
|
|
9cfd4ebb5e | ||
|
|
ed359c497a | ||
|
|
dcee9be95a | ||
|
|
bd8c4c0752 | ||
|
|
0140eafb15 | ||
|
|
bdf6a0a57b | ||
|
|
0674d1fee7 | ||
|
|
30108fc8b0 | ||
|
|
e2d1c8b5e8 | ||
|
|
6951fcd44f | ||
|
|
39474513f6 | ||
|
|
638a872d77 | ||
|
|
9040151fe1 | ||
|
|
8fbe3f303f | ||
|
|
ea2c148fa7 | ||
|
|
47b7af0d87 | ||
|
|
269bf46d99 | ||
|
|
e5a77a5015 | ||
|
|
ca1ac1a4b4 | ||
|
|
4ca3fa6bb4 | ||
|
|
be12afd284 | ||
|
|
df3c0291a3 | ||
|
|
2be1a0f74b | ||
|
|
4120a05ff1 | ||
|
|
98ff042917 | ||
|
|
b55156eae9 | ||
|
|
112944fab9 | ||
|
|
91be5f9be3 | ||
|
|
4ee847e400 | ||
|
|
040a505ff5 | ||
|
|
9279c59a0e | ||
|
|
7454096199 | ||
|
|
fb8b5e05fc | ||
|
|
e5d96dc8fc | ||
|
|
daa05bf340 | ||
|
|
7769b58307 | ||
|
|
2f9f946b22 | ||
|
|
2890aecce5 | ||
|
|
34f093b417 | ||
|
|
4dce8321a9 | ||
|
|
657855ab41 | ||
|
|
e27b8ba3d1 | ||
|
|
40b8363b45 | ||
|
|
8b10e4fb31 | ||
|
|
104605cbf2 | ||
|
|
96266f119b | ||
|
|
7c0cf3bcd0 | ||
|
|
572b432913 | ||
|
|
9515c20868 | ||
|
|
c63ca2b2e6 | ||
|
|
a32eaf5bb2 | ||
|
|
e390742c59 | ||
|
|
7a6ebcbfcf | ||
|
|
c7bc12c20f | ||
|
|
f9e2a38386 | ||
|
|
4426447bba | ||
|
|
3322e26420 | ||
|
|
765e461065 | ||
|
|
6a9cceb219 | ||
|
|
199f914183 | ||
|
|
ca21483bf9 | ||
|
|
da70c87e81 | ||
|
|
0b6d52629f | ||
|
|
d3cc379567 | ||
|
|
354cd580d5 | ||
|
|
d49f273144 | ||
|
|
b21d384304 | ||
|
|
e3126cd107 | ||
|
|
e37ff5b5c8 | ||
|
|
6accb21f2a | ||
|
|
053f3b6309 | ||
|
|
5f82706a21 | ||
|
|
c32a58cc2a | ||
|
|
ef2c4f778d | ||
|
|
9dade5da3a | ||
|
|
828f862acb | ||
|
|
577df69b26 | ||
|
|
04244fd0e1 | ||
|
|
9482b0b085 | ||
|
|
5bc1da147f | ||
|
|
0091017188 | ||
|
|
0d81a1fe61 | ||
|
|
6ae4c8d6fc | ||
|
|
a913b612d8 | ||
|
|
5ce2d10e4a | ||
|
|
738d0a281f | ||
|
|
70b81c4f3d | ||
|
|
7476d148db | ||
|
|
f3732bd931 | ||
|
|
0ef7f79054 | ||
|
|
5dd8df0701 | ||
|
|
39bfb57b7c | ||
|
|
c9d838fc33 | ||
|
|
b1169d7be8 | ||
|
|
17808394bc | ||
|
|
296839a1b0 | ||
|
|
c373b5c00d | ||
|
|
de1a86b7de | ||
|
|
99267c23ca | ||
|
|
525f2eeb0b | ||
|
|
918b7890a1 | ||
|
|
98b09ddc27 | ||
|
|
cef1f302d2 | ||
|
|
17c47fb869 | ||
|
|
b322b197f1 | ||
|
|
eaf7c9b976 | ||
|
|
47a1f11bff | ||
|
|
fad09e8a1f | ||
|
|
8c31f47c63 | ||
|
|
261801242f | ||
|
|
fcf0687b27 | ||
|
|
86b7e3c95a | ||
|
|
0e95916155 | ||
|
|
ce2ef42fd3 | ||
|
|
8b6325758c | ||
|
|
a0dd1995c7 | ||
|
|
f1740006e4 | ||
|
|
58cde5c026 | ||
|
|
761e0aa7a0 | ||
|
|
ff9fbc9aff | ||
|
|
e6c4797704 | ||
|
|
09e4576f65 | ||
|
|
3ed7b1e6e0 | ||
|
|
e8f9dbc369 | ||
|
|
de35c06c66 | ||
|
|
c0745a851a | ||
|
|
b5ca9c3557 | ||
|
|
245758992e | ||
|
|
1204cf0a9d | ||
|
|
b36adfa349 | ||
|
|
e78821b438 | ||
|
|
51f0acda79 | ||
|
|
fa75204b16 | ||
|
|
bdb903bb5f | ||
|
|
68f783a727 | ||
|
|
c5030c439d | ||
|
|
51b2333be1 | ||
|
|
4ed51308c8 | ||
|
|
c781fbbab3 | ||
|
|
979ff44cea | ||
|
|
f63ed7b5ac | ||
|
|
c9e5096256 | ||
|
|
2ff0ad9694 | ||
|
|
a836524d20 | ||
|
|
3717a4dd47 | ||
|
|
ecfcdd2ce4 | ||
|
|
c25dbc2d27 | ||
|
|
77d2a5f17b | ||
|
|
59192dfd39 | ||
|
|
56cb1baa66 | ||
|
|
f340324335 | ||
|
|
2660b9289c | ||
|
|
293f036e6d | ||
|
|
0fb142a454 | ||
|
|
00f8e0d211 | ||
|
|
4af9ed21cb | ||
|
|
9c7cab5ebb | ||
|
|
132bfd45b6 | ||
|
|
24b4272a8c | ||
|
|
8a680463fa | ||
|
|
20b14095a4 | ||
|
|
17c1bdf371 | ||
|
|
3e3d320c1b | ||
|
|
54a62a79f7 | ||
|
|
384dc7f77b | ||
|
|
f04d5226f8 | ||
|
|
0a0a1a198b | ||
|
|
6c1cfbad32 | ||
|
|
45f526d652 | ||
|
|
5db91f0aaf | ||
|
|
061980c36a | ||
|
|
7a49742b88 | ||
|
|
3e6a1e1686 | ||
|
|
7961486a9b | ||
|
|
4f9b14c21c | ||
|
|
31a458c091 | ||
|
|
a3a51d20e7 | ||
|
|
e5b807607c | ||
|
|
fd4d96302a | ||
|
|
c0f011918d | ||
|
|
e6ae4b1be1 | ||
|
|
2dccb38f73 | ||
|
|
d157216093 | ||
|
|
93f3c8e531 | ||
|
|
2cc26c3a99 | ||
|
|
dfa8852db2 | ||
|
|
714c6e0eab | ||
|
|
0fefd00e6c | ||
|
|
f5c081d432 | ||
|
|
c88ea8338b | ||
|
|
9f9ecff4cd | ||
|
|
ca1954d58c | ||
|
|
55e6d3d5c0 | ||
|
|
6682c231fa | ||
|
|
5ae685c1c8 | ||
|
|
ce8cf9161d | ||
|
|
18be11fd59 | ||
|
|
8d8855fdae | ||
|
|
e855d380fa | ||
|
|
0e5a9382af | ||
|
|
04bf5a35fa | ||
|
|
43a73f853b | ||
|
|
ffbc2e5bdb | ||
|
|
f9e6db3034 | ||
|
|
d61d2b08e9 | ||
|
|
f5e59ee7a6 | ||
|
|
9b005edc48 | ||
|
|
bf9a185395 | ||
|
|
ad041c79db | ||
|
|
747b068136 | ||
|
|
122f75d939 | ||
|
|
d8f8a7aad2 | ||
|
|
0115e957d4 | ||
|
|
116ed130f4 | ||
|
|
8374387bd8 | ||
|
|
912fbe9555 | ||
|
|
52131f88d9 | ||
|
|
821eb80c0d | ||
|
|
a2956a0f8e | ||
|
|
911355e216 | ||
|
|
8d3f8f485e | ||
|
|
96efb91480 | ||
|
|
2754231ba3 | ||
|
|
2390d44209 | ||
|
|
7362b4450a | ||
|
|
57a314d155 | ||
|
|
d4c57863f7 | ||
|
|
68e1b711f1 | ||
|
|
0024f39a32 | ||
|
|
e9163b536e | ||
|
|
7acaea634c | ||
|
|
697e4ff352 | ||
|
|
a3e2e250f0 | ||
|
|
143e4dccdf | ||
|
|
6590a3ecda | ||
|
|
b3debb7e77 | ||
|
|
458c1a4b2d | ||
|
|
821fde2df4 | ||
|
|
8c29042bb9 | ||
|
|
5467d137b3 | ||
|
|
3ed46f374b | ||
|
|
84868e4793 | ||
|
|
a8e8d62dd8 | ||
|
|
e42b49bd69 | ||
|
|
4a718e770d | ||
|
|
600a039f57 | ||
|
|
ffa5d74f15 | ||
|
|
74fe80ee95 | ||
|
|
bcfdadb1bc | ||
|
|
236de72e49 | ||
|
|
a116f96930 | ||
|
|
092ace9e3a | ||
|
|
f680dc1b39 | ||
|
|
b41aa264f9 | ||
|
|
367cf5cd3e | ||
|
|
6d53efd2a5 | ||
|
|
8b346309a5 | ||
|
|
54a6db827f | ||
|
|
9efc4db965 | ||
|
|
f1816fb192 | ||
|
|
0005d2a3c9 | ||
|
|
d0b402974f | ||
|
|
6341d43043 | ||
|
|
7afe0faab1 | ||
|
|
5a3f1eb62f | ||
|
|
b3ce711b93 | ||
|
|
abf61aaa8e | ||
|
|
4508532fbd | ||
|
|
d5af196c18 | ||
|
|
82f836d976 | ||
|
|
4fccd30f19 | ||
|
|
cfaf4668f7 | ||
|
|
99a57bdf74 | ||
|
|
a2268617cf | ||
|
|
a4ad9db541 | ||
|
|
b373b5102a | ||
|
|
f296a1966d | ||
|
|
bc2c0c86ef | ||
|
|
891c60dcd5 | ||
|
|
1ce13cf992 | ||
|
|
10f08dedfa | ||
|
|
5e1a373d2e | ||
|
|
572c776bfb | ||
|
|
55d8073d06 | ||
|
|
cd32d6f586 | ||
|
|
aaa3092f51 | ||
|
|
87985077a4 | ||
|
|
a79c1c2c80 | ||
|
|
cc8f1f4764 | ||
|
|
05b9e8ab5b | ||
|
|
2cdf92228c | ||
|
|
c973ecdead | ||
|
|
e39257a552 | ||
|
|
cc16b24b17 | ||
|
|
bdc2343454 | ||
|
|
f444c05c32 | ||
|
|
85199f9681 | ||
|
|
a1257fd1ea | ||
|
|
abcffbba8c | ||
|
|
53ec16a705 | ||
|
|
2e693f48e7 | ||
|
|
7f1f36bf91 | ||
|
|
5282c7d4d0 | ||
|
|
9e19f8338b | ||
|
|
06e0bc21d2 | ||
|
|
5a71cdd76e | ||
|
|
f0d3658c0f | ||
|
|
57431d8231 | ||
|
|
3e64fe4a18 | ||
|
|
8cb24d3aed | ||
|
|
00726c74c9 | ||
|
|
9fe404ed04 | ||
|
|
802f306cd1 | ||
|
|
894843eb25 | ||
|
|
584a3f56de | ||
|
|
36735fd772 | ||
|
|
6ecabe4936 | ||
|
|
2f8b4ce0c0 | ||
|
|
2ef69456f5 | ||
|
|
17852aa503 | ||
|
|
8647c6cf51 | ||
|
|
513949f95f | ||
|
|
262b76a09f | ||
|
|
c34ba6b961 | ||
|
|
24062b704f | ||
|
|
d6b61e5166 | ||
|
|
cf632499ee | ||
|
|
a3774a8198 | ||
|
|
0ce21c46a0 | ||
|
|
55eed6b7a5 | ||
|
|
c77181e534 | ||
|
|
12001f2ebc | ||
|
|
7ee5d5093b | ||
|
|
428bc718bd | ||
|
|
ff1e3d9c63 | ||
|
|
35bdca5431 | ||
|
|
8a24842765 | ||
|
|
65986db6ba | ||
|
|
9556af87d5 | ||
|
|
a1a3523a56 | ||
|
|
741f4e046b | ||
|
|
a5d06dc557 | ||
|
|
5efa206a8c | ||
|
|
196802dfa6 | ||
|
|
c84b519cf3 | ||
|
|
741ecf0630 | ||
|
|
b7e5a588d8 | ||
|
|
822e250ab7 | ||
|
|
bea02cdf93 | ||
|
|
a3ea760ea5 | ||
|
|
35db669f1d | ||
|
|
afebeffbfb | ||
|
|
5573894737 | ||
|
|
d5816c8c2f | ||
|
|
8ccbcda5c0 | ||
|
|
a9e532afe2 | ||
|
|
f3163bba67 | ||
|
|
700a1ddc65 | ||
|
|
f33251ffc8 | ||
|
|
e584dce52b | ||
|
|
40c0461f24 | ||
|
|
724759684c | ||
|
|
9c34e9d24f | ||
|
|
09b6f99852 | ||
|
|
c87fb515ed | ||
|
|
5353c9b016 | ||
|
|
13e79fc811 | ||
|
|
9d07a3d6e4 | ||
|
|
646b85544b | ||
|
|
4286cc5ec2 | ||
|
|
545d18d81b | ||
|
|
e661b9ee83 | ||
|
|
c910eeb125 | ||
|
|
f4ae58b38b | ||
|
|
e568cf88bc | ||
|
|
098d844731 | ||
|
|
a40ee486f2 | ||
|
|
eac2dc2b41 | ||
|
|
d5080aeaa4 | ||
|
|
f22d6e0267 | ||
|
|
76c6e6da08 | ||
|
|
4184653775 | ||
|
|
4aaaf8c8ce | ||
|
|
4bf533623b | ||
|
|
5f77ef15ae | ||
|
|
7d6abdd022 | ||
|
|
a8ff2cca92 | ||
|
|
42fadebecb | ||
|
|
a197eda9c3 | ||
|
|
82b110d50e | ||
|
|
9040cd40af | ||
|
|
fa0d353acf | ||
|
|
b386bb3d7c | ||
|
|
fe714dd507 | ||
|
|
8ab3d7427c | ||
|
|
84e436ed1c | ||
|
|
81939e7733 | ||
|
|
195d1ca3e8 | ||
|
|
8d983d7cd6 | ||
|
|
65b2f405dc | ||
|
|
2a68464c5b | ||
|
|
bdd8981dab | ||
|
|
f088a831dd |
@@ -10,7 +10,7 @@ steps:
|
||||
docker build
|
||||
--build-arg max_jobs=16
|
||||
--build-arg REMOTE_VLLM=1
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
|
||||
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
|
||||
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
-f docker/Dockerfile.rocm
|
||||
|
||||
@@ -21,6 +21,20 @@ steps:
|
||||
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
- label: CPU-Compatibility Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
source_file_dependencies:
|
||||
- cmake/cpu_extension.cmake
|
||||
- setup.py
|
||||
- vllm/platforms/cpu.py
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
|
||||
|
||||
- label: CPU-Language Generation and Pooling Model Tests
|
||||
depends_on: []
|
||||
soft_fail: true
|
||||
|
||||
@@ -25,9 +25,7 @@ fi
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
|
||||
--build-arg VLLM_CPU_AVX512BF16=true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=true \
|
||||
--build-arg VLLM_CPU_AMXBF16=true \
|
||||
--build-arg VLLM_CPU_X86=true \
|
||||
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
|
||||
--target vllm-test \
|
||||
--progress plain .
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
||||
@@ -7,12 +7,12 @@ import argparse
|
||||
import html as _html
|
||||
import json
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass
|
||||
from importlib import util
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import regex as re
|
||||
|
||||
pd.options.display.float_format = "{:.2f}".format
|
||||
plotly_found = util.find_spec("plotly.express") is not None
|
||||
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
|
||||
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Concurrency normalization (NEW, small)
|
||||
# -----------------------------
|
||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
for c in [
|
||||
"# of max concurrency.",
|
||||
"# of max concurrency",
|
||||
"Max Concurrency",
|
||||
"max_concurrency",
|
||||
"Concurrency",
|
||||
]:
|
||||
if c in df.columns:
|
||||
return c
|
||||
|
||||
for c in df.columns:
|
||||
if "concurr" in str(c).lower():
|
||||
s = df[c]
|
||||
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
|
||||
return c
|
||||
|
||||
raise ValueError(
|
||||
"Cannot infer concurrency column. "
|
||||
"Please rename the column to one of the known names "
|
||||
"or add an explicit override (e.g., --concurrency-col)."
|
||||
)
|
||||
|
||||
|
||||
def _normalize_concurrency_in_df(
|
||||
df: pd.DataFrame, canonical: str = "# of max concurrency."
|
||||
) -> pd.DataFrame:
|
||||
if canonical in df.columns:
|
||||
return df
|
||||
detected = _find_concurrency_col(df)
|
||||
if detected in df.columns and detected != canonical:
|
||||
return df.rename(columns={detected: canonical})
|
||||
df[canonical] = pd.NA
|
||||
return df
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# Core data compare
|
||||
# -----------------------------
|
||||
@@ -52,19 +91,25 @@ def compare_data_columns(
|
||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
||||
group by columns.
|
||||
- If --debug, add a <file_label>_name column per file.
|
||||
|
||||
Minimal fix to support different max_concurrency lists across files:
|
||||
- normalize concurrency column naming to "# of max concurrency."
|
||||
- align on UNION of keys (missing points become NaN)
|
||||
- BUGFIX: don't drop throughput rows based on P99/Median presence
|
||||
"""
|
||||
print("\ncompare_data_column:", data_column)
|
||||
|
||||
frames = []
|
||||
raw_data_cols: list[str] = []
|
||||
compare_frames = []
|
||||
|
||||
# Determine key cols after normalizing concurrency
|
||||
cols_per_file: list[set] = []
|
||||
for f in files:
|
||||
try:
|
||||
df_tmp = pd.read_json(f, orient="records")
|
||||
except Exception as err:
|
||||
raise ValueError(f"Failed to read {f}") from err
|
||||
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
|
||||
cols_per_file.append(set(df_tmp.columns))
|
||||
|
||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
||||
@@ -75,12 +120,25 @@ def compare_data_columns(
|
||||
"No common key columns found from info_cols across the input files."
|
||||
)
|
||||
|
||||
meta_added = False
|
||||
union_index = None
|
||||
metas: list[pd.DataFrame] = []
|
||||
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
|
||||
|
||||
for file in files:
|
||||
df = pd.read_json(file, orient="records")
|
||||
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
|
||||
|
||||
if drop_column in df.columns:
|
||||
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
|
||||
# NaN in P99/Median columns even if the column exists in the JSON.
|
||||
metric_lc = str(data_column).lower()
|
||||
is_latency_metric = (
|
||||
"ttft" in metric_lc
|
||||
or "tpot" in metric_lc
|
||||
or "p99" in metric_lc
|
||||
or "median" in metric_lc
|
||||
or metric_lc.strip() in {"p99", "median"}
|
||||
)
|
||||
if is_latency_metric and drop_column in df.columns:
|
||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
||||
|
||||
for c in (
|
||||
@@ -105,35 +163,61 @@ def compare_data_columns(
|
||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
||||
|
||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
||||
s = df_idx[data_column]
|
||||
if not s.index.is_unique:
|
||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
||||
|
||||
if data_column in df_idx.columns:
|
||||
s = df_idx[data_column]
|
||||
if not s.index.is_unique:
|
||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
||||
else:
|
||||
# keep NA series to preserve meta keys for union_index
|
||||
s = pd.Series(pd.NA, index=meta.index)
|
||||
s.name = file_label
|
||||
|
||||
if not meta_added:
|
||||
frames.append(meta)
|
||||
meta_added = True
|
||||
|
||||
name_s = None
|
||||
if debug and name_column in df_idx.columns:
|
||||
name_s = df_idx[name_column]
|
||||
if not name_s.index.is_unique:
|
||||
name_s = name_s.groupby(level=key_cols, dropna=False).first()
|
||||
name_s.name = f"{file_label}_name"
|
||||
frames.append(name_s)
|
||||
|
||||
frames.append(s)
|
||||
if union_index is None:
|
||||
union_index = meta.index
|
||||
else:
|
||||
union_index = union_index.union(meta.index)
|
||||
metas.append(meta)
|
||||
|
||||
staged.append((file_label, s, name_s))
|
||||
|
||||
if union_index is None:
|
||||
raise ValueError("No data found after loading inputs.")
|
||||
|
||||
# meta first (union-aligned): build UNION meta across all files
|
||||
if metas:
|
||||
meta_union = pd.concat(metas, axis=0)
|
||||
# Collapse duplicates on the MultiIndex; keep first non-null per column
|
||||
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
|
||||
frames.append(meta_union.reindex(union_index))
|
||||
|
||||
# values + ratios (union-aligned)
|
||||
metric_series_aligned: list[pd.Series] = []
|
||||
for file_label, s, name_s in staged:
|
||||
s_aligned = s.reindex(union_index)
|
||||
frames.append(s_aligned)
|
||||
raw_data_cols.append(file_label)
|
||||
compare_frames.append(s)
|
||||
metric_series_aligned.append(s_aligned)
|
||||
|
||||
if len(compare_frames) >= 2:
|
||||
base = compare_frames[0]
|
||||
current = compare_frames[-1]
|
||||
if "P99" in data_column or "Median" in data_column:
|
||||
if debug and name_s is not None:
|
||||
frames.append(name_s.reindex(union_index))
|
||||
|
||||
if len(metric_series_aligned) >= 2:
|
||||
base = metric_series_aligned[0]
|
||||
current = metric_series_aligned[-1]
|
||||
if "P99" in str(data_column) or "Median" in str(data_column):
|
||||
ratio = base / current
|
||||
else:
|
||||
ratio = current / base
|
||||
ratio = ratio.mask(base == 0)
|
||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
|
||||
frames.append(ratio)
|
||||
|
||||
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
|
||||
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
|
||||
# -----------------------------
|
||||
# Styling helpers
|
||||
# -----------------------------
|
||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
||||
for c in [
|
||||
"# of max concurrency.",
|
||||
"# of max concurrency",
|
||||
"Max Concurrency",
|
||||
"max_concurrency",
|
||||
"Concurrency",
|
||||
]:
|
||||
if c in df.columns:
|
||||
return c
|
||||
for c in df.columns:
|
||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
||||
return c
|
||||
return "# of max concurrency."
|
||||
|
||||
|
||||
def _highlight_threshold(
|
||||
df: pd.DataFrame, threshold: float
|
||||
df: pd.DataFrame,
|
||||
threshold: float,
|
||||
slack_pct: float = 0.0,
|
||||
) -> pd.io.formats.style.Styler:
|
||||
conc_col = _find_concurrency_col(df)
|
||||
key_cols = [
|
||||
@@ -234,12 +304,24 @@ def _highlight_threshold(
|
||||
]
|
||||
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
|
||||
|
||||
return df.style.map(
|
||||
lambda v: "background-color:#e6ffe6;font-weight:bold;"
|
||||
if pd.notna(v) and v <= threshold
|
||||
else "",
|
||||
subset=conf_cols,
|
||||
)
|
||||
try:
|
||||
slack_pct = float(slack_pct or 0.0)
|
||||
except Exception:
|
||||
slack_pct = 0.0
|
||||
slack_limit = threshold * (1.0 + slack_pct / 100.0)
|
||||
|
||||
def _cell(v):
|
||||
if pd.isna(v):
|
||||
return ""
|
||||
if v <= threshold:
|
||||
# Strict SLA
|
||||
return "background-color:#e6ffe6;font-weight:bold;"
|
||||
if v <= slack_limit:
|
||||
# Within slack range
|
||||
return "background-color:#ffe5cc;font-weight:bold;"
|
||||
return ""
|
||||
|
||||
return df.style.map(_cell, subset=conf_cols)
|
||||
|
||||
|
||||
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
|
||||
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
|
||||
- max 31 chars
|
||||
- cannot contain: : \ / ? * [ ]
|
||||
- cannot be empty
|
||||
|
||||
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
|
||||
module's compile overhead/edge-cases on some systems.
|
||||
"""
|
||||
name = "sheet" if name is None else str(name)
|
||||
name = re.sub(r"[:\\/?*\[\]]", "_", name)
|
||||
|
||||
# Replace illegal characters with underscore.
|
||||
trans = str.maketrans(
|
||||
{
|
||||
":": "_",
|
||||
"\\": "_",
|
||||
"/": "_",
|
||||
"?": "_",
|
||||
"*": "_",
|
||||
"[": "_",
|
||||
"]": "_",
|
||||
}
|
||||
)
|
||||
name = name.translate(trans)
|
||||
|
||||
# Strip quotes/spaces and collapse whitespace.
|
||||
name = name.strip().strip("'")
|
||||
name = re.sub(r"\s+", " ", name)
|
||||
name = " ".join(name.split())
|
||||
|
||||
if not name:
|
||||
name = "sheet"
|
||||
return name[:31]
|
||||
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:
|
||||
|
||||
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
|
||||
d = dict(zip(group_cols, gkey_tuple))
|
||||
model = d.get("Model", "model")
|
||||
model_short = str(model).split("/")[-1]
|
||||
|
||||
# Always keep input/output lengths (these are important).
|
||||
ilen = d.get("Input Len", "")
|
||||
olen = d.get("Output Len", "")
|
||||
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
|
||||
|
||||
# Shorten model name aggressively to make room for lens.
|
||||
model = d.get("Model", "model")
|
||||
leaf = str(model).split("/")[-1]
|
||||
|
||||
max_model_len = max(1, 31 - len(lens))
|
||||
model_short = leaf[:max_model_len]
|
||||
|
||||
return _sanitize_sheet_name(f"{model_short}{lens}")
|
||||
|
||||
|
||||
def _write_tables_to_excel_sheet(
|
||||
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
|
||||
):
|
||||
startrow = 0
|
||||
"""Write all blocks to a sheet with a single to_excel() call.
|
||||
|
||||
Pandas+openpyxl can be extremely slow when called many times per sheet.
|
||||
We flatten blocks into one table with a 'Section' column to keep structure
|
||||
while making Excel generation fast and deterministic.
|
||||
"""
|
||||
if not blocks:
|
||||
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
|
||||
return
|
||||
|
||||
combined_parts: list[pd.DataFrame] = []
|
||||
for title, df in blocks:
|
||||
pd.DataFrame([[title]]).to_excel(
|
||||
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
|
||||
)
|
||||
startrow += 1
|
||||
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
|
||||
startrow += len(df) + 3
|
||||
df2 = df.copy()
|
||||
# Put the section label as the first column for readability.
|
||||
df2.insert(0, "Section", title)
|
||||
combined_parts.append(df2)
|
||||
|
||||
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
|
||||
combined.to_excel(writer, sheet_name=sheet, index=False)
|
||||
|
||||
|
||||
def _safe_filename(s: str) -> str:
|
||||
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
|
||||
return s[:180] if len(s) > 180 else s
|
||||
# Fast path without the third-party `regex` module.
|
||||
s = " ".join(str(s).strip().split())
|
||||
allowed = []
|
||||
for ch in s:
|
||||
if ch.isalnum() or ch in "._-":
|
||||
allowed.append(ch)
|
||||
else:
|
||||
allowed.append("_")
|
||||
out = "".join(allowed)
|
||||
return out[:180] if len(out) > 180 else out
|
||||
|
||||
|
||||
# -----------------------------
|
||||
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
|
||||
|
||||
|
||||
def _max_concurrency_ok(
|
||||
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
|
||||
df: pd.DataFrame,
|
||||
conc_col: str,
|
||||
cfg_col: str,
|
||||
threshold: float,
|
||||
slack_pct: float = 0.0,
|
||||
):
|
||||
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
|
||||
return pd.NA
|
||||
@@ -441,7 +573,14 @@ def _max_concurrency_ok(
|
||||
if d.empty:
|
||||
return pd.NA
|
||||
|
||||
ok = d[d[cfg_col] <= threshold]
|
||||
# Accept values up to (1 + slack_pct%) above the SLA.
|
||||
try:
|
||||
slack_pct = float(slack_pct or 0.0)
|
||||
except Exception:
|
||||
slack_pct = 0.0
|
||||
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
|
||||
|
||||
ok = d[d[cfg_col] <= effective_limit]
|
||||
if ok.empty:
|
||||
return pd.NA
|
||||
|
||||
@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
|
||||
if not cfg_cols:
|
||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||
|
||||
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
|
||||
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
|
||||
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
|
||||
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
|
||||
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
|
||||
|
||||
rows = []
|
||||
for cfg in cfg_cols:
|
||||
ttft_max = (
|
||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||
_max_concurrency_ok(
|
||||
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
|
||||
)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_max = (
|
||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||
_max_concurrency_ok(
|
||||
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
|
||||
)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
|
||||
rows.append(
|
||||
{
|
||||
"Configuration": cfg,
|
||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
|
||||
f"Max {conc_col} (Both)": both,
|
||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||
"TTFT @ Both (ms)": ttft_at_both,
|
||||
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
|
||||
if not cfg_cols:
|
||||
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
|
||||
|
||||
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
|
||||
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
|
||||
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
|
||||
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
|
||||
|
||||
rows = []
|
||||
for cfg in cfg_cols:
|
||||
ttft_max = (
|
||||
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
|
||||
_max_concurrency_ok(
|
||||
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
|
||||
)
|
||||
if ttft_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
tpot_max = (
|
||||
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
|
||||
_max_concurrency_ok(
|
||||
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
|
||||
)
|
||||
if tpot_group_df is not None
|
||||
else pd.NA
|
||||
)
|
||||
@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
|
||||
rows.append(
|
||||
{
|
||||
"Configuration": cfg,
|
||||
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
|
||||
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
|
||||
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
|
||||
f"Max {conc_col} (Both)": both,
|
||||
"Output Tput @ Both (tok/s)": tput_at_both,
|
||||
"TTFT @ Both (ms)": ttft_at_both,
|
||||
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
help="Reference limit for TPOT plots (ms)",
|
||||
)
|
||||
|
||||
# ---- NEW: export options ----
|
||||
# ---- SLA tolerance (slack) options ----
|
||||
parser.add_argument(
|
||||
"--ttft-slack-pct",
|
||||
type=float,
|
||||
default=5.0,
|
||||
help="Allowed percentage above TTFT SLA (default: 5).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tpot-slack-pct",
|
||||
type=float,
|
||||
default=5.0,
|
||||
help="Allowed percentage above TPOT SLA (default: 5).",
|
||||
)
|
||||
|
||||
# ---- export options ----
|
||||
parser.add_argument(
|
||||
"--excel-out",
|
||||
type=str,
|
||||
@@ -843,9 +1015,13 @@ def render_metric_table_html(
|
||||
|
||||
metric_name = metric_label.lower()
|
||||
if "ttft" in metric_name:
|
||||
styler = _highlight_threshold(display_group, args.ttft_max_ms)
|
||||
styler = _highlight_threshold(
|
||||
display_group, args.ttft_max_ms, args.ttft_slack_pct
|
||||
)
|
||||
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
|
||||
styler = _highlight_threshold(display_group, args.tpot_max_ms)
|
||||
styler = _highlight_threshold(
|
||||
display_group, args.tpot_max_ms, args.tpot_slack_pct
|
||||
)
|
||||
else:
|
||||
styler = display_group.style
|
||||
|
||||
@@ -962,22 +1138,46 @@ def write_report_group_first(
|
||||
csv_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
excel_path = args.excel_out or "perf_comparison.xlsx"
|
||||
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
|
||||
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
|
||||
|
||||
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
|
||||
excel_engine = (
|
||||
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
|
||||
)
|
||||
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
|
||||
excel_engine = "openpyxl"
|
||||
|
||||
excel_engine_kwargs = {}
|
||||
if excel_engine == "xlsxwriter":
|
||||
# Reduce memory pressure & usually faster writes.
|
||||
excel_engine_kwargs = {"options": {"constant_memory": True}}
|
||||
|
||||
xw_ctx = (
|
||||
nullcontext(None)
|
||||
if disable_excel
|
||||
else pd.ExcelWriter(
|
||||
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
|
||||
)
|
||||
)
|
||||
with xw_ctx as xw:
|
||||
used_sheets: set[str] = set()
|
||||
# ---- Environment sheet (first) ----
|
||||
env_sheet = _sanitize_sheet_name("Environment")
|
||||
env_df = _load_env_df_for_inputs(args, files)
|
||||
if env_df is None or env_df.empty:
|
||||
pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"Section": "Environment",
|
||||
"Key": "vllm_env.txt",
|
||||
"Value": "NOT FOUND (or empty)",
|
||||
}
|
||||
]
|
||||
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
else:
|
||||
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
if xw is not None:
|
||||
if env_df is None or env_df.empty:
|
||||
pd.DataFrame(
|
||||
[
|
||||
{
|
||||
"Section": "Environment",
|
||||
"Key": "vllm_env.txt",
|
||||
"Value": "NOT FOUND (or empty)",
|
||||
}
|
||||
]
|
||||
).to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
else:
|
||||
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
|
||||
used_sheets.add(env_sheet)
|
||||
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
|
||||
main_fh.write('<meta charset="utf-8">\n')
|
||||
for gkey in group_keys:
|
||||
@@ -993,12 +1193,19 @@ def write_report_group_first(
|
||||
|
||||
main_fh.write(group_header)
|
||||
|
||||
do_excel = xw is not None
|
||||
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
|
||||
sheet_base = sheet
|
||||
dedup_i = 1
|
||||
while sheet in xw.sheets:
|
||||
dedup_i += 1
|
||||
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
|
||||
if do_excel:
|
||||
dedup_i = 1
|
||||
while sheet in used_sheets:
|
||||
dedup_i += 1
|
||||
suffix = f"_{dedup_i}"
|
||||
# Ensure uniqueness even when sheet names are truncated.
|
||||
base = str(sheet_base)
|
||||
keep = max(1, 31 - len(suffix))
|
||||
sheet = _sanitize_sheet_name(base[:keep] + suffix)
|
||||
used_sheets.add(sheet)
|
||||
|
||||
excel_blocks: list[tuple[str, pd.DataFrame]] = []
|
||||
|
||||
@@ -1059,7 +1266,7 @@ def write_report_group_first(
|
||||
)
|
||||
|
||||
excel_blocks.append(
|
||||
(metric_label, display_group.reset_index(drop=True))
|
||||
(metric_label, group_df.reset_index(drop=True))
|
||||
)
|
||||
if csv_dir:
|
||||
fn = _safe_filename(
|
||||
@@ -1067,7 +1274,7 @@ def write_report_group_first(
|
||||
"/", "_"
|
||||
)
|
||||
)
|
||||
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||
|
||||
summary_html = build_valid_max_concurrency_summary_html(
|
||||
tput_group_df=tput_group_df,
|
||||
@@ -1097,9 +1304,13 @@ def write_report_group_first(
|
||||
)
|
||||
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
|
||||
|
||||
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||
if do_excel:
|
||||
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
|
||||
|
||||
print(f"Wrote Excel: {excel_path}")
|
||||
if disable_excel:
|
||||
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
|
||||
else:
|
||||
print(f"Wrote Excel: {excel_path}")
|
||||
if csv_dir:
|
||||
print(f"Wrote CSVs under: {csv_dir}")
|
||||
|
||||
|
||||
365
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
365
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Executable file → Normal file
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
|
||||
MODEL_FILTER="${MODEL_FILTER:-}"
|
||||
DTYPE_FILTER="${DTYPE_FILTER:-}"
|
||||
|
||||
# Adaptive search controls
|
||||
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
|
||||
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
|
||||
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
|
||||
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
|
||||
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
|
||||
|
||||
check_gpus() {
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
@@ -183,6 +190,304 @@ upload_to_buildkite() {
|
||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
# -------------------------------
|
||||
# Adaptive concurrency helpers
|
||||
# -------------------------------
|
||||
result_json_path_for_serving() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency=$3
|
||||
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
|
||||
}
|
||||
|
||||
extract_metric_ms() {
|
||||
local metric_name=$1
|
||||
local json_file=$2
|
||||
|
||||
[[ -f "$json_file" ]] || return 0
|
||||
|
||||
if [[ "$metric_name" == "ttft" ]]; then
|
||||
jq -r '
|
||||
[
|
||||
.ttft_ms.p99?,
|
||||
.metrics.ttft_ms.p99?,
|
||||
.ttft.p99?,
|
||||
.metrics.ttft.p99?,
|
||||
.p99_ttft_ms?,
|
||||
.ttft_ms.mean?,
|
||||
.metrics.ttft_ms.mean?,
|
||||
.ttft.mean?,
|
||||
.metrics.ttft.mean?,
|
||||
.mean_ttft_ms?
|
||||
] | map(select(. != null)) | .[0] // empty
|
||||
' "$json_file"
|
||||
else
|
||||
jq -r '
|
||||
[
|
||||
.tpot_ms.p99?,
|
||||
.metrics.tpot_ms.p99?,
|
||||
.tpot.p99?,
|
||||
.metrics.tpot.p99?,
|
||||
.p99_tpot_ms?,
|
||||
.itl_ms.p99?,
|
||||
.metrics.itl_ms.p99?,
|
||||
.inter_token_latency_ms.p99?,
|
||||
.tpot_ms.mean?,
|
||||
.metrics.tpot_ms.mean?,
|
||||
.tpot.mean?,
|
||||
.metrics.tpot.mean?,
|
||||
.itl_ms.mean?,
|
||||
.metrics.itl_ms.mean?,
|
||||
.mean_tpot_ms?,
|
||||
.mean_itl_ms?
|
||||
] | map(select(. != null)) | .[0] // empty
|
||||
' "$json_file"
|
||||
fi
|
||||
}
|
||||
|
||||
evaluate_sla_from_json() {
|
||||
local json_file=$1
|
||||
local ttft
|
||||
local tpot
|
||||
local pass
|
||||
|
||||
[[ -f "$json_file" ]] || return 2
|
||||
|
||||
ttft=$(extract_metric_ms ttft "$json_file")
|
||||
tpot=$(extract_metric_ms tpot "$json_file")
|
||||
|
||||
[[ -n "$ttft" && -n "$tpot" ]] || return 2
|
||||
|
||||
pass=$(jq -n \
|
||||
--argjson ttft "$ttft" \
|
||||
--argjson tpot "$tpot" \
|
||||
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
|
||||
|
||||
[[ "$pass" == "true" ]]
|
||||
}
|
||||
|
||||
write_adaptive_summary_json() {
|
||||
local summary_file=$1
|
||||
local test_name=$2
|
||||
local qps=$3
|
||||
local static_last_pass=$4
|
||||
local static_first_fail=$5
|
||||
local final_last_pass=$6
|
||||
local final_first_fail=$7
|
||||
|
||||
jq -n \
|
||||
--arg test_name "$test_name" \
|
||||
--arg qps "$qps" \
|
||||
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||
--arg static_last_pass "${static_last_pass:-}" \
|
||||
--arg static_first_fail "${static_first_fail:-}" \
|
||||
--arg final_last_pass "${final_last_pass:-}" \
|
||||
--arg final_first_fail "${final_first_fail:-}" \
|
||||
'{
|
||||
test_name: $test_name,
|
||||
qps: $qps,
|
||||
sla_ttft_ms: $sla_ttft,
|
||||
sla_tpot_ms: $sla_tpot,
|
||||
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
|
||||
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
|
||||
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
|
||||
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
|
||||
}' > "$summary_file"
|
||||
}
|
||||
|
||||
run_single_serving_probe() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency=$3
|
||||
local tp=$4
|
||||
local compilation_config_mode=$5
|
||||
local optimization_level=$6
|
||||
local client_args_effective=$7
|
||||
local client_remote_args=$8
|
||||
local server_command=$9
|
||||
|
||||
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||
local result_json
|
||||
local num_prompts_arg=""
|
||||
local client_command
|
||||
|
||||
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
|
||||
|
||||
if [[ -f "$result_json" ]]; then
|
||||
evaluate_sla_from_json "$result_json"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||
num_prompts_arg="--num-prompts $num_prompts"
|
||||
fi
|
||||
|
||||
client_command="vllm bench serve \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
$num_prompts_arg \
|
||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
|
||||
$client_args_effective $client_remote_args "
|
||||
|
||||
echo "Adaptive probe: $client_command"
|
||||
|
||||
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||
bash -c "$client_command"
|
||||
fi
|
||||
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
adaptive_search: true
|
||||
}')
|
||||
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
evaluate_sla_from_json "$result_json"
|
||||
}
|
||||
|
||||
adaptive_refine_from_static_results() {
|
||||
local test_name=$1
|
||||
local qps=$2
|
||||
local max_concurrency_list_raw=$3
|
||||
local tp=$4
|
||||
local compilation_config_mode=$5
|
||||
local optimization_level=$6
|
||||
local client_args_effective=$7
|
||||
local client_remote_args=$8
|
||||
local server_command=$9
|
||||
|
||||
local sorted_points
|
||||
local point
|
||||
local rc
|
||||
local static_last_pass=""
|
||||
local static_first_fail=""
|
||||
local largest_static=""
|
||||
local step_hint=1
|
||||
local previous_point=""
|
||||
local low
|
||||
local high
|
||||
local mid
|
||||
local probes=0
|
||||
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
|
||||
|
||||
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
|
||||
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
|
||||
|
||||
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
|
||||
[[ -n "$sorted_points" ]] || return 0
|
||||
|
||||
while read -r point; do
|
||||
[[ -z "$point" ]] && continue
|
||||
largest_static="$point"
|
||||
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
static_last_pass="$point"
|
||||
elif (( rc == 1 )); then
|
||||
if [[ -n "$static_last_pass" ]]; then
|
||||
static_first_fail="$point"
|
||||
break
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -n "$previous_point" ]]; then
|
||||
step_hint=$(( point - previous_point ))
|
||||
if (( step_hint < 1 )); then step_hint=1; fi
|
||||
fi
|
||||
previous_point="$point"
|
||||
done <<< "$sorted_points"
|
||||
|
||||
if [[ -z "$static_last_pass" ]]; then
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -n "$static_first_fail" ]]; then
|
||||
low=$static_last_pass
|
||||
high=$static_first_fail
|
||||
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
mid=$(( (low + high) / 2 ))
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$mid" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$mid
|
||||
elif (( rc == 1 )); then
|
||||
high=$mid
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
|
||||
return 0
|
||||
fi
|
||||
|
||||
low=$largest_static
|
||||
high=""
|
||||
while (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
point=$(( low + step_hint ))
|
||||
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
|
||||
point=$ADAPTIVE_MAX_CONCURRENCY
|
||||
fi
|
||||
(( point > low )) || break
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$point" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$point
|
||||
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
|
||||
step_hint=$(( step_hint * 2 ))
|
||||
if (( step_hint < 1 )); then step_hint=1; fi
|
||||
elif (( rc == 1 )); then
|
||||
high=$point
|
||||
break
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ -n "$high" ]]; then
|
||||
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
|
||||
mid=$(( (low + high) / 2 ))
|
||||
probes=$(( probes + 1 ))
|
||||
run_single_serving_probe \
|
||||
"$test_name" "$qps" "$mid" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
rc=$?
|
||||
if (( rc == 0 )); then
|
||||
low=$mid
|
||||
elif (( rc == 1 )); then
|
||||
high=$mid
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
|
||||
}
|
||||
|
||||
run_benchmark_tests() {
|
||||
# run benchmark tests using `vllm bench <test_type>` command
|
||||
# $1: test type (latency or throughput)
|
||||
@@ -347,10 +652,48 @@ run_serving_tests() {
|
||||
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
|
||||
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||
|
||||
server_args=$(json2args "$server_params")
|
||||
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
|
||||
server_model=$(echo "$server_params" | jq -r '.model // empty')
|
||||
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
|
||||
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
|
||||
exit 1
|
||||
fi
|
||||
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
|
||||
server_args=$(json2args "$server_params_no_model")
|
||||
|
||||
server_envs=$(json2envs "$server_envs")
|
||||
client_args=$(json2args "$client_params")
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Option 1: Dynamic num-prompts scaling based on max_concurrency
|
||||
#
|
||||
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
|
||||
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
|
||||
#
|
||||
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
|
||||
# unchanged (i.e., whatever is in serving-tests-*.json).
|
||||
# ------------------------------------------------------------
|
||||
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
|
||||
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
|
||||
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
|
||||
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
|
||||
# Handles: --num-prompts 123 and --num-prompts=123
|
||||
client_args_no_np="$(
|
||||
printf ' %s ' "$client_args" \
|
||||
| sed -E \
|
||||
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
|
||||
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
|
||||
)"
|
||||
# normalize whitespace
|
||||
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
|
||||
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
|
||||
client_args_effective="$client_args_no_np"
|
||||
else
|
||||
client_args_effective="$client_args"
|
||||
fi
|
||||
# qps_list
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
@@ -382,14 +725,13 @@ run_serving_tests() {
|
||||
fi
|
||||
|
||||
# check if server model and client model is aligned
|
||||
server_model=$(echo "$server_params" | jq -r '.model')
|
||||
client_model=$(echo "$client_params" | jq -r '.model')
|
||||
if [[ $server_model != "$client_model" ]]; then
|
||||
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
server_command="$server_envs vllm serve \
|
||||
server_command="$server_envs vllm serve $server_model \
|
||||
$server_args"
|
||||
|
||||
# run the server
|
||||
@@ -436,6 +778,14 @@ run_serving_tests() {
|
||||
for max_concurrency in $max_concurrency_list; do
|
||||
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||
echo " new test name $new_test_name"
|
||||
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
|
||||
num_prompts_arg=""
|
||||
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||
num_prompts_arg="--num-prompts $num_prompts"
|
||||
fi
|
||||
# pass the tensor parallel size, the compilation mode, and the optimization
|
||||
# level to the client so that they can be used on the benchmark dashboard
|
||||
client_command="vllm bench serve \
|
||||
@@ -444,8 +794,9 @@ run_serving_tests() {
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--max-concurrency $max_concurrency \
|
||||
$num_prompts_arg \
|
||||
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
|
||||
$client_args $client_remote_args "
|
||||
$client_args_effective $client_remote_args "
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
@@ -467,6 +818,11 @@ run_serving_tests() {
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
adaptive_refine_from_static_results \
|
||||
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
|
||||
"$compilation_config_mode" "$optimization_level" \
|
||||
"$client_args_effective" "$client_remote_args" "$server_command"
|
||||
done
|
||||
|
||||
# clean up
|
||||
@@ -532,6 +888,7 @@ main() {
|
||||
# postprocess benchmarking results
|
||||
pip install tabulate pandas
|
||||
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
|
||||
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
|
||||
|
||||
upload_to_buildkite
|
||||
}
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"defaults": {
|
||||
"qps_list": [
|
||||
"inf"
|
||||
],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
|
||||
},
|
||||
"server_parameters": {
|
||||
"dtype": "bfloat16",
|
||||
"model": "openai/whisper-large-v3-turbo"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "openai/whisper-large-v3-turbo",
|
||||
"backend": "openai-audio",
|
||||
"endpoint": "/v1/audio/transcriptions",
|
||||
"dataset_name": "hf",
|
||||
"dataset_path": "openslr/librispeech_asr",
|
||||
"hf_subset": "clean",
|
||||
"hf_split": "test",
|
||||
"no_stream": "",
|
||||
"no_oversample": "",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
"tests": [
|
||||
{
|
||||
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -149,6 +149,39 @@
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
@@ -188,6 +221,45 @@
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama3B_tp1_random_128_128",
|
||||
"server_parameters": {
|
||||
|
||||
@@ -72,17 +72,6 @@
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
||||
"server_parameters": {
|
||||
@@ -105,17 +94,6 @@
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
||||
"server_parameters": {
|
||||
@@ -139,14 +117,25 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_2048_128",
|
||||
"test_name": "serving_llama8B_tp1_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 4
|
||||
"tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 128
|
||||
"random-output-len": 2048
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_2048_2048",
|
||||
"server_parameters": {
|
||||
"tensor_parallel_size": 2
|
||||
},
|
||||
"client_parameters": {
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 2048,
|
||||
"random-output-len": 2048
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -83,7 +83,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
|
||||
@@ -152,7 +152,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
|
||||
@@ -16,6 +16,23 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
|
||||
WORK_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$WORK_DIR"' EXIT
|
||||
|
||||
# ── Detect PyTorch index URL ─────────────────────────────────────────────
|
||||
|
||||
if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
|
||||
ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
|
||||
CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
|
||||
if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
|
||||
TORCH_INDEX_URL="${CANDIDATE_URL}"
|
||||
else
|
||||
echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
|
||||
echo ">>> Falling back to default PyPI (resolution may be incomplete)"
|
||||
TORCH_INDEX_URL=""
|
||||
fi
|
||||
else
|
||||
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
|
||||
fi
|
||||
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
|
||||
|
||||
# Fetch all Ray requirement files used in the LLM depset pipeline
|
||||
echo ">>> Fetching Ray requirement files"
|
||||
RAY_FILES=(
|
||||
@@ -116,6 +133,11 @@ echo "============================================================"
|
||||
echo ">>> Resolving: Can Ray generate compatible lock files?"
|
||||
echo "============================================================"
|
||||
|
||||
EXTRA_INDEX_ARGS=()
|
||||
if [[ -n "${TORCH_INDEX_URL}" ]]; then
|
||||
EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
|
||||
fi
|
||||
|
||||
set +e
|
||||
uv pip compile \
|
||||
"${WORK_DIR}/requirements.txt" \
|
||||
@@ -126,7 +148,7 @@ uv pip compile \
|
||||
-c "${WORK_DIR}/vllm-constraints.txt" \
|
||||
--python-version 3.12 \
|
||||
--python-platform x86_64-manylinux_2_31 \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu129 \
|
||||
"${EXTRA_INDEX_ARGS[@]}" \
|
||||
--index-strategy unsafe-best-match \
|
||||
--unsafe-package setuptools \
|
||||
--unsafe-package ray \
|
||||
|
||||
@@ -205,6 +205,13 @@ re_quote_pytest_markers() {
|
||||
esac
|
||||
|
||||
if $is_boundary; then
|
||||
# Strip surrounding double quotes if present (from upstream
|
||||
# single-to-double conversion); without this, wrapping below
|
||||
# would produce '"expr"' with literal double-quote characters.
|
||||
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||
marker_buf="${marker_buf#\"}"
|
||||
marker_buf="${marker_buf%\"}"
|
||||
fi
|
||||
# Flush the collected marker expression
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}' "
|
||||
@@ -242,6 +249,11 @@ re_quote_pytest_markers() {
|
||||
|
||||
# Flush any trailing marker expression (marker at end of command)
|
||||
if $collecting && [[ -n "$marker_buf" ]]; then
|
||||
# Strip surrounding double quotes (see mid-stream flush comment)
|
||||
if [[ "$marker_buf" == '"'*'"' ]]; then
|
||||
marker_buf="${marker_buf#\"}"
|
||||
marker_buf="${marker_buf%\"}"
|
||||
fi
|
||||
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
|
||||
output+="'${marker_buf}'"
|
||||
else
|
||||
@@ -321,15 +333,18 @@ apply_rocm_test_overrides() {
|
||||
# --- Entrypoint ignores ---
|
||||
if [[ $cmds == *" entrypoints/openai "* ]]; then
|
||||
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/chat_completion/test_audio.py \
|
||||
--ignore=entrypoints/openai/completion/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/models/test_models.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
|
||||
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" entrypoints/serve"* ]]; then
|
||||
cmds="${cmds} \
|
||||
--ignore=entrypoints/serve/lora/test_lora_adapters.py"
|
||||
fi
|
||||
|
||||
if [[ $cmds == *" entrypoints/llm "* ]]; then
|
||||
@@ -492,6 +507,8 @@ else
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-e BUILDKITE_PARALLEL_JOB \
|
||||
-e BUILDKITE_PARALLEL_JOB_COUNT \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
|
||||
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
|
||||
set -euox pipefail
|
||||
|
||||
export VLLM_CPU_KVCACHE_SPACE=1
|
||||
export VLLM_CPU_CI_ENV=1
|
||||
# Reduce sub-processes for acceleration
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
export VLLM_ENABLE_V1_MULTIPROCESSING=0
|
||||
|
||||
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
|
||||
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
|
||||
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
|
||||
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
|
||||
mkdir -p sde
|
||||
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
|
||||
|
||||
wait_for_pid_and_check_log() {
|
||||
local pid="$1"
|
||||
local log_file="$2"
|
||||
local exit_status
|
||||
|
||||
if [ -z "$pid" ] || [ -z "$log_file" ]; then
|
||||
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "Waiting for process $pid to finish..."
|
||||
|
||||
# Use the 'wait' command to pause the script until the specific PID exits.
|
||||
# The 'wait' command's own exit status will be that of the waited-for process.
|
||||
if wait "$pid"; then
|
||||
exit_status=$?
|
||||
echo "Process $pid finished with exit status $exit_status (Success)."
|
||||
else
|
||||
exit_status=$?
|
||||
echo "Process $pid finished with exit status $exit_status (Failure)."
|
||||
fi
|
||||
|
||||
if [ "$exit_status" -ne 0 ]; then
|
||||
echo "Process exited with a non-zero status."
|
||||
echo "--- Last few lines of log file: $log_file ---"
|
||||
tail -n 50 "$log_file"
|
||||
echo "---------------------------------------------"
|
||||
return 1 # Indicate failure based on exit status
|
||||
fi
|
||||
|
||||
echo "No errors detected in log file and process exited successfully."
|
||||
return 0
|
||||
}
|
||||
|
||||
# Test Sky Lake (AVX512F)
|
||||
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
|
||||
PID_TEST_0=$!
|
||||
|
||||
# Test Cascade Lake (AVX512F + VNNI)
|
||||
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
|
||||
PID_TEST_1=$!
|
||||
|
||||
# Test Cooper Lake (AVX512F + VNNI + BF16)
|
||||
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
|
||||
PID_TEST_2=$!
|
||||
|
||||
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
|
||||
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
|
||||
wait_for_pid_and_check_log $PID_TEST_2 test_2.log
|
||||
@@ -127,7 +127,7 @@ run_and_track_test() {
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 2 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 3 "test_lora.py" \
|
||||
|
||||
@@ -33,23 +33,22 @@ docker run \
|
||||
bash -c '
|
||||
set -e
|
||||
echo $ZE_AFFINITY_MASK
|
||||
pip install tblib==3.1.0
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager --max-model-len 8192
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||
cd tests
|
||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||
pytest -v -s v1/engine
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# Nightly e2e test for prefetch offloading with a MoE model.
|
||||
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
|
||||
# and validates GSM8K accuracy matches baseline (no offloading).
|
||||
#
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
#
|
||||
# Environment variables:
|
||||
# ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
|
||||
# ROCM_ATTN, FLASHINFER). If unset, uses vllm default.
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
|
||||
@@ -22,6 +25,14 @@ wait_for_server() {
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-Lite"
|
||||
|
||||
# ── Build optional vllm serve flags ─────────────────────────────────────
|
||||
|
||||
EXTRA_ARGS=()
|
||||
if [[ -n "${ATTENTION_BACKEND:-}" ]]; then
|
||||
echo "Using attention backend: ${ATTENTION_BACKEND}"
|
||||
EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
@@ -40,7 +51,8 @@ vllm serve "$MODEL" \
|
||||
--offload-num-in-group 2 \
|
||||
--offload-prefetch-step 1 \
|
||||
--offload-params w13_weight w2_weight \
|
||||
--port "$PORT" &
|
||||
--port "$PORT" \
|
||||
${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} &
|
||||
SERVER_PID=$!
|
||||
wait_for_server "$PORT"
|
||||
|
||||
|
||||
248
.buildkite/scripts/tool_call/run-bfcl-eval.sh
Executable file
248
.buildkite/scripts/tool_call/run-bfcl-eval.sh
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/bin/bash
|
||||
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
|
||||
# evaluation against a local vLLM server.
|
||||
#
|
||||
# Usage:
|
||||
# # Run with defaults (gpt-oss-20b, multi_turn)
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# # Run with gpt-oss-120b and multiple test categories
|
||||
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
|
||||
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
|
||||
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
|
||||
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
|
||||
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
|
||||
#
|
||||
# Environment variables (all optional, with defaults):
|
||||
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
|
||||
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
|
||||
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
|
||||
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
|
||||
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
|
||||
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
|
||||
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
|
||||
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
|
||||
# BFCL_PORT - Server port (default: 8000)
|
||||
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
|
||||
# BFCL_EXTRA_ARGS - Additional vLLM server args
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ---- Configuration ----
|
||||
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
|
||||
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
|
||||
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
|
||||
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
|
||||
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
|
||||
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
|
||||
TP_SIZE="${BFCL_TP_SIZE:-1}"
|
||||
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
|
||||
PORT="${BFCL_PORT:-8000}"
|
||||
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
|
||||
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
|
||||
|
||||
# Set up output directory
|
||||
if [ -n "$OUTPUT_DIR" ]; then
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
|
||||
fi
|
||||
|
||||
echo "============================================"
|
||||
echo "BFCL Tool Call Correctness Evaluation"
|
||||
echo "============================================"
|
||||
echo "Model: $MODEL"
|
||||
echo "Tool parser: $TOOL_CALL_PARSER"
|
||||
echo "API type: $API_TYPE"
|
||||
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
|
||||
echo "Test category: $TEST_CATEGORY"
|
||||
echo "TP size: $TP_SIZE"
|
||||
echo "Max model len: $MAX_MODEL_LEN"
|
||||
echo "Port: $PORT"
|
||||
echo "Num threads: $NUM_THREADS"
|
||||
echo "============================================"
|
||||
|
||||
# ---- Install bfcl-eval if missing ----
|
||||
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
|
||||
echo "Installing bfcl-eval..."
|
||||
pip install "bfcl-eval>=2025.10.20.1,<2026"
|
||||
fi
|
||||
|
||||
# ---- Cleanup handler ----
|
||||
SERVER_PID=""
|
||||
cleanup() {
|
||||
if [ -n "$SERVER_PID" ]; then
|
||||
echo "Stopping vLLM server (pid=$SERVER_PID)..."
|
||||
kill "$SERVER_PID" 2>/dev/null || true
|
||||
wait "$SERVER_PID" 2>/dev/null || true
|
||||
fi
|
||||
# Remove BFCL lock files (created by filelock for thread-safe writes)
|
||||
rm -rf .file_locks/
|
||||
if [ -n "${OUTPUT_DIR:-}" ]; then
|
||||
rm -rf "$OUTPUT_DIR/.file_locks/"
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# ---- Start vLLM server ----
|
||||
echo "Starting vLLM server..."
|
||||
|
||||
SERVE_ARGS=(
|
||||
"$MODEL"
|
||||
--port "$PORT"
|
||||
--enable-auto-tool-choice
|
||||
--tool-call-parser "$TOOL_CALL_PARSER"
|
||||
--tensor-parallel-size "$TP_SIZE"
|
||||
--max-model-len "$MAX_MODEL_LEN"
|
||||
--enforce-eager
|
||||
--no-enable-prefix-caching
|
||||
)
|
||||
|
||||
# Append reasoning parser if specified
|
||||
if [ -n "$REASONING_PARSER" ]; then
|
||||
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
|
||||
fi
|
||||
|
||||
# Append any extra args
|
||||
if [ -n "$EXTRA_ARGS" ]; then
|
||||
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
|
||||
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
|
||||
fi
|
||||
|
||||
echo "Command: vllm serve ${SERVE_ARGS[*]}"
|
||||
vllm serve "${SERVE_ARGS[@]}" &
|
||||
SERVER_PID=$!
|
||||
|
||||
# ---- Wait for server to be ready ----
|
||||
echo "Waiting for vLLM server to start (timeout: 600s)..."
|
||||
SECONDS_WAITED=0
|
||||
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
|
||||
if [ $SECONDS_WAITED -ge 600 ]; then
|
||||
echo ""
|
||||
echo "ERROR: vLLM server failed to start within 600s"
|
||||
exit 1
|
||||
fi
|
||||
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
|
||||
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
|
||||
fi
|
||||
sleep 2
|
||||
SECONDS_WAITED=$((SECONDS_WAITED + 2))
|
||||
done
|
||||
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
|
||||
|
||||
# ---- Run BFCL evaluation ----
|
||||
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
|
||||
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
|
||||
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
|
||||
# against our local vLLM server.
|
||||
bfcl_exit_code=0
|
||||
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
|
||||
import os
|
||||
import sys
|
||||
|
||||
model = sys.argv[1]
|
||||
test_category = sys.argv[2]
|
||||
num_threads = int(sys.argv[3])
|
||||
port = sys.argv[4]
|
||||
api_type = sys.argv[5]
|
||||
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
|
||||
|
||||
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
|
||||
os.environ["OPENAI_API_KEY"] = "dummy"
|
||||
os.environ["BFCL_PROJECT_ROOT"] = output_dir
|
||||
|
||||
import bfcl_eval.constants.model_config as bfcl_model_config
|
||||
from bfcl_eval.constants.model_config import ModelConfig
|
||||
from bfcl_eval.model_handler.api_inference.openai_completion import (
|
||||
OpenAICompletionsHandler,
|
||||
)
|
||||
from bfcl_eval.model_handler.api_inference.openai_response import (
|
||||
OpenAIResponsesHandler,
|
||||
)
|
||||
|
||||
if api_type == "responses":
|
||||
handler = OpenAIResponsesHandler
|
||||
else:
|
||||
handler = OpenAICompletionsHandler
|
||||
|
||||
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
|
||||
model_name=model,
|
||||
display_name=f"{model} (FC) (vLLM)",
|
||||
url=f"https://huggingface.co/{model}",
|
||||
org="",
|
||||
license="apache-2.0",
|
||||
model_handler=handler,
|
||||
input_price=None,
|
||||
output_price=None,
|
||||
is_fc_model=True,
|
||||
underscore_to_dot=True,
|
||||
)
|
||||
|
||||
from bfcl_eval.__main__ import evaluate, generate
|
||||
import inspect
|
||||
import typer
|
||||
|
||||
|
||||
def _get_default_kwargs(function):
|
||||
kwargs = {}
|
||||
for k, v in inspect.signature(function).parameters.items():
|
||||
if v.default is not inspect.Parameter.empty:
|
||||
default = v.default
|
||||
if isinstance(default, typer.models.OptionInfo):
|
||||
default = default.default
|
||||
kwargs[k] = default
|
||||
return kwargs
|
||||
|
||||
|
||||
# ---- generate ----
|
||||
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
|
||||
gen_kwargs = _get_default_kwargs(generate)
|
||||
gen_kwargs["model"] = [model]
|
||||
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
|
||||
gen_kwargs["skip_server_setup"] = True
|
||||
gen_kwargs["num_threads"] = num_threads
|
||||
generate(**gen_kwargs)
|
||||
|
||||
# ---- evaluate ----
|
||||
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
|
||||
eval_kwargs = _get_default_kwargs(evaluate)
|
||||
eval_kwargs["model"] = [model]
|
||||
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
|
||||
evaluate(**eval_kwargs)
|
||||
|
||||
print("=== BFCL evaluation completed successfully ===")
|
||||
PYEOF
|
||||
|
||||
# ---- Upload results to buildkite ----
|
||||
if command -v buildkite-agent &>/dev/null; then
|
||||
if [ $bfcl_exit_code -eq 0 ]; then
|
||||
STYLE="success"
|
||||
STATUS="PASSED"
|
||||
else
|
||||
STYLE="error"
|
||||
STATUS="FAILED"
|
||||
fi
|
||||
|
||||
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
|
||||
### BFCL Tool Call Correctness - ${STATUS}
|
||||
- **Model:** \`${MODEL}\`
|
||||
- **Parser:** \`${TOOL_CALL_PARSER}\`
|
||||
- **API type:** \`${API_TYPE}\`
|
||||
- **Test category:** \`${TEST_CATEGORY}\`
|
||||
EOF
|
||||
|
||||
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
|
||||
# $BFCL_PROJECT_ROOT/score/
|
||||
RESULTS_ROOT="${OUTPUT_DIR:-.}"
|
||||
if [ -d "$RESULTS_ROOT/result" ]; then
|
||||
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
|
||||
fi
|
||||
if [ -d "$RESULTS_ROOT/score" ]; then
|
||||
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
|
||||
fi
|
||||
fi
|
||||
|
||||
exit $bfcl_exit_code
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,8 +14,3 @@ steps:
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
@@ -59,7 +59,7 @@ steps:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
- pytest -s -v tests/compile/passes/distributed
|
||||
|
||||
- label: Fusion and Compile Unit Tests (B200)
|
||||
- label: Fusion and Compile Unit Tests (2xB200)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/"
|
||||
device: b200
|
||||
@@ -101,8 +101,8 @@ steps:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
|
||||
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
|
||||
|
||||
- label: Fusion E2E Config Sweep (H100)
|
||||
timeout_in_minutes: 30
|
||||
@@ -132,9 +132,9 @@ steps:
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
|
||||
|
||||
- label: Fusion E2E TP2 Quick (H100)
|
||||
timeout_in_minutes: 20
|
||||
@@ -150,8 +150,8 @@ steps:
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models and attn backends but only Inductor partition and native custom ops
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
|
||||
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
|
||||
timeout_in_minutes: 40
|
||||
@@ -205,7 +205,7 @@ steps:
|
||||
commands:
|
||||
- nvidia-smi
|
||||
# Run all models but only FLASHINFER, Inductor partition and native custom ops
|
||||
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
|
||||
# for ar-rms-quant-fp4, also sweep llama3
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
|
||||
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
|
||||
|
||||
@@ -15,8 +15,29 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_buffer.py
|
||||
- pytest -v -s distributed/test_shm_storage.py
|
||||
|
||||
- label: Distributed (2 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
- label: Distributed DP Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/v1/distributed
|
||||
- tests/entrypoints/openai/test_multi_api_servers.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
|
||||
|
||||
- label: Distributed Compile + RPC Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
@@ -29,62 +50,80 @@ steps:
|
||||
- vllm/v1/worker/
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
- label: Distributed Torchrun + Shutdown Tests (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/distributed/
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
timeout_in_minutes: 50
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
- label: Distributed Torchrun + Examples (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- examples/offline_inference/rlhf.py
|
||||
- tests/distributed/test_torchrun_example.py
|
||||
- tests/distributed/test_torchrun_example_moe.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- examples/offline_inference/new_weight_syncing/
|
||||
- examples/rl/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
- tests/distributed/test_multiproc_executor.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- python3 examples/offline_inference/data_parallel.py --enforce-eager
|
||||
# rlhf examples
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py
|
||||
|
||||
- label: Distributed DP Tests (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_utils
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
@@ -92,22 +131,27 @@ steps:
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
|
||||
- label: Distributed Compile + Comm (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/compile/fullgraph/test_basic_correctness.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
- tests/distributed/test_multiproc_executor.py
|
||||
commands:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- pytest -v -s compile/fullgraph/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||
# test multi-node TP with multiproc executor (simulated on single node)
|
||||
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
# OLD rlhf examples
|
||||
- cd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
# NEW rlhf examples
|
||||
- cd new_weight_syncing
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
|
||||
|
||||
- label: Distributed Tests (8 GPUs)(H100)
|
||||
timeout_in_minutes: 10
|
||||
@@ -149,7 +193,7 @@ steps:
|
||||
num_devices: 2
|
||||
commands:
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
@@ -14,28 +14,30 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine (1 GPU)
|
||||
timeout_in_minutes: 45
|
||||
- label: Engine (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
- vllm/v1/engine/
|
||||
- tests/v1/engine/
|
||||
commands:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Run this test standalone for now;
|
||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
commands:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: e2e Scheduling (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: e2e Core (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: V1 e2e (2 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
@@ -46,7 +48,7 @@ steps:
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
@@ -62,9 +64,21 @@ steps:
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_4
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: V1 e2e (4xH100)
|
||||
timeout_in_minutes: 60
|
||||
device: h100
|
||||
num_devices: 4
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/v1/attention/backends/utils.py
|
||||
- vllm/v1/worker/gpu_model_runner.py
|
||||
- tests/v1/e2e/test_hybrid_chunked_prefill.py
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
|
||||
|
||||
@@ -10,7 +10,7 @@ steps:
|
||||
- tests/entrypoints/
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/tool_parsers
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
||||
|
||||
- label: Entrypoints Integration (LLM)
|
||||
timeout_in_minutes: 40
|
||||
@@ -24,11 +24,6 @@ steps:
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (API Server 1)
|
||||
timeout_in_minutes: 130
|
||||
@@ -39,7 +34,7 @@ steps:
|
||||
- tests/entrypoints/test_chat_utils
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
mirror:
|
||||
amd:
|
||||
@@ -53,18 +48,13 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/rpc
|
||||
- tests/entrypoints/instrumentator
|
||||
- tests/entrypoints/serve/instrumentator
|
||||
- tests/tool_use
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/instrumentator
|
||||
- pytest -v -s entrypoints/serve/instrumentator
|
||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||
- pytest -v -s tool_use
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (Pooling)
|
||||
timeout_in_minutes: 50
|
||||
@@ -75,11 +65,6 @@ steps:
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/pooling
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Entrypoints Integration (Responses API)
|
||||
timeout_in_minutes: 50
|
||||
@@ -90,19 +75,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s entrypoints/openai/responses
|
||||
|
||||
- label: Entrypoints V1
|
||||
timeout_in_minutes: 50
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: OpenAI API Correctness
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
|
||||
@@ -24,8 +24,7 @@ steps:
|
||||
|
||||
- label: Elastic EP Scaling Test
|
||||
timeout_in_minutes: 20
|
||||
device: b200
|
||||
optional: true
|
||||
device: h100
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
|
||||
@@ -35,7 +35,7 @@ steps:
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
timeout_in_minutes: 60
|
||||
timeout_in_minutes: 25
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
- csrc/moe/
|
||||
@@ -47,7 +47,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
parallelism: 5
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
timeout_in_minutes: 45
|
||||
|
||||
@@ -45,6 +45,22 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
||||
|
||||
- label: LM Eval Qwen3.5 Models (B200)
|
||||
timeout_in_minutes: 120
|
||||
device: b200
|
||||
optional: true
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/qwen3_5.py
|
||||
- vllm/model_executor/models/qwen3_5_mtp.py
|
||||
- vllm/transformers_utils/configs/qwen3_5.py
|
||||
- vllm/transformers_utils/configs/qwen3_5_moe.py
|
||||
- vllm/model_executor/models/qwen3_next.py
|
||||
- vllm/model_executor/models/qwen3_next_mtp.py
|
||||
- vllm/model_executor/layers/fla/ops/
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
|
||||
|
||||
- label: LM Eval Large Models (H200)
|
||||
timeout_in_minutes: 60
|
||||
device: h200
|
||||
|
||||
@@ -8,7 +8,7 @@ steps:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
commands:
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
|
||||
- pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py --ignore=lora/test_qwen35_densemoel_lora.py
|
||||
parallelism: 4
|
||||
|
||||
|
||||
@@ -30,4 +30,5 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
- pytest -v -s -x lora/test_qwen35_densemoel_lora.py
|
||||
@@ -88,11 +88,6 @@ steps:
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Metrics, Tracing (2 GPUs)
|
||||
timeout_in_minutes: 20
|
||||
|
||||
@@ -9,9 +9,9 @@ steps:
|
||||
- vllm/config/model.py
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
|
||||
110
.buildkite/test_areas/model_runner_v2.yaml
Normal file
110
.buildkite/test_areas/model_runner_v2.yaml
Normal file
@@ -0,0 +1,110 @@
|
||||
group: Model Runner V2
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Model Runner V2 Core Tests
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- vllm/v1/core/sched/
|
||||
- vllm/v1/attention/
|
||||
- tests/v1/engine/test_llm_engine.py
|
||||
- tests/v1/e2e/
|
||||
- tests/entrypoints/llm/test_struct_output_generate.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
|
||||
# This requires eager until we sort out CG correctness issues.
|
||||
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
|
||||
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
|
||||
- pytest -v -s v1/e2e/general/test_context_length.py
|
||||
- pytest -v -s v1/e2e/general/test_min_tokens.py
|
||||
# Temporary hack filter to exclude ngram spec decoding based tests.
|
||||
- pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
|
||||
|
||||
- label: Model Runner V2 Examples
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/core/sched/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- examples/offline_inference/
|
||||
- examples/basic/offline_inference/
|
||||
- examples/pooling/embed/vision_embedding_offline.py
|
||||
- examples/others/tensorize_vllm_model.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
- pip install tensorizer # for tensorizer test
|
||||
- python3 basic/offline_inference/chat.py # for basic
|
||||
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
|
||||
#- python3 basic/offline_inference/embed.py # TODO
|
||||
# for multi-modal models
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
- label: Model Runner V2 Distributed (2 GPUs)
|
||||
timeout_in_minutes: 45
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 2
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- tests/basic_correctness/test_basic_correctness.py
|
||||
- tests/v1/distributed/test_async_llm_dp.py
|
||||
- tests/v1/distributed/test_eagle_dp.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
|
||||
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
|
||||
# These require fix https://github.com/vllm-project/vllm/pull/36280
|
||||
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_devices: 4
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- tests/distributed/test_pipeline_parallel.py
|
||||
#- tests/distributed/test_pp_cudagraph.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
|
||||
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
|
||||
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
|
||||
|
||||
- label: Model Runner V2 Spec Decode
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- tests/v1/spec_decode/test_max_len.py
|
||||
- tests/v1/e2e/spec_decode/test_spec_decode.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
|
||||
@@ -2,15 +2,59 @@ group: Models - Multimodal
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Multi-Modal Models (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
- label: "Multi-Modal Models (Standard) 1: qwen2"
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
|
||||
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
|
||||
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
|
||||
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: "Multi-Modal Models (Standard) 4: other + whisper"
|
||||
timeout_in_minutes: 45
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
mirror:
|
||||
amd:
|
||||
@@ -18,7 +62,7 @@ steps:
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Processor Test (CPU)
|
||||
- label: Multi-Modal Processor (CPU)
|
||||
depends_on:
|
||||
- image-build-cpu
|
||||
timeout_in_minutes: 60
|
||||
@@ -51,34 +95,44 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models (Extended) 1
|
||||
- label: Multi-Modal Models (Extended Generation 1)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
- tests/models/multimodal/test_mapping.py
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||
- pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
|
||||
- pytest -v -s models/multimodal/test_mapping.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
- label: Multi-Modal Models (Extended) 2
|
||||
- label: Multi-Modal Models (Extended Generation 2)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended) 3
|
||||
- label: Multi-Modal Models (Extended Generation 3)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/multimodal/generation
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models (Extended Pooling)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal/pooling
|
||||
commands:
|
||||
- pytest -v -s models/multimodal/pooling -m 'not core_model'
|
||||
|
||||
@@ -36,11 +36,6 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
|
||||
@@ -35,7 +35,7 @@ steps:
|
||||
# as it is a heavy test that is covered in other steps.
|
||||
# Use `find` to launch multiple instances of pytest so that
|
||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
|
||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||
|
||||
- label: PyTorch Fullgraph
|
||||
timeout_in_minutes: 30
|
||||
|
||||
40
.buildkite/test_areas/spec_decode.yaml
Normal file
40
.buildkite/test_areas/spec_decode.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
group: Spec Decode
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Spec Decode Eagle
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
|
||||
|
||||
- label: Spec Decode Speculators + MTP
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- vllm/transformers_utils/configs/speculators/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
|
||||
|
||||
- label: Spec Decode Ngram + Suffix
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
|
||||
|
||||
- label: Spec Decode Draft Model
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
|
||||
3
.github/CODEOWNERS
vendored
3
.github/CODEOWNERS
vendored
@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
@@ -171,6 +171,7 @@ mkdocs.yaml @hmellor
|
||||
|
||||
# Pooling models
|
||||
/examples/pooling @noooop
|
||||
/docs/models/pooling_models @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
|
||||
13
.github/mergify.yml
vendored
13
.github/mergify.yml
vendored
@@ -27,7 +27,7 @@ pull_request_rules:
|
||||
Hi @{{author}}, the pre-commit checks have failed. Please run:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
uv pip install pre-commit>=4.5.1
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
@@ -260,7 +260,7 @@ pull_request_rules:
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/structured_outputs/structured_outputs.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files=tests/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
@@ -333,9 +333,10 @@ pull_request_rules:
|
||||
- label != stale
|
||||
- or:
|
||||
- files~=^tests/tool_use/
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
- files~=^tests/tool_parsers/
|
||||
- files~=^tests/entrypoints/openai/.*tool.*
|
||||
- files~=^tests/entrypoints/anthropic/.*tool.*
|
||||
- files~=^vllm/tool_parsers/
|
||||
- files=docs/features/tool_calling.md
|
||||
- files~=^examples/tool_chat_*
|
||||
- files=examples/offline_inference/chat_with_tools.py
|
||||
@@ -381,7 +382,7 @@ pull_request_rules:
|
||||
- or:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
|
||||
50
.github/scripts/cleanup_pr_body.sh
vendored
50
.github/scripts/cleanup_pr_body.sh
vendored
@@ -1,50 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -eu
|
||||
|
||||
# ensure 1 argument is passed
|
||||
if [ "$#" -ne 1 ]; then
|
||||
echo "Usage: $0 <pr_number>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PR_NUMBER=$1
|
||||
OLD=/tmp/orig_pr_body.txt
|
||||
NEW=/tmp/new_pr_body.txt
|
||||
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
|
||||
sed -i '/<!--.*-->$/d' "${NEW}"
|
||||
|
||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
|
||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||
python3 - <<EOF
|
||||
import regex as re
|
||||
|
||||
with open("${NEW}", "r") as file:
|
||||
content = file.read()
|
||||
|
||||
pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
|
||||
content = re.sub(pattern, '', content)
|
||||
|
||||
with open("${NEW}", "w") as file:
|
||||
file.write(content)
|
||||
EOF
|
||||
|
||||
# Run this only if ${NEW} is different than ${OLD}
|
||||
if ! cmp -s "${OLD}" "${NEW}"; then
|
||||
gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
|
||||
echo
|
||||
echo "Updated PR body:"
|
||||
echo
|
||||
cat "${NEW}"
|
||||
else
|
||||
echo "No changes needed"
|
||||
fi
|
||||
32
.github/workflows/cleanup_pr_body.yml
vendored
32
.github/workflows/cleanup_pr_body.yml
vendored
@@ -1,32 +0,0 @@
|
||||
name: Cleanup PR Body
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened, edited]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
|
||||
with:
|
||||
python-version: '3.12'
|
||||
cache: 'pip'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install regex
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
105
.github/workflows/issue_autolabel.yml
vendored
105
.github/workflows/issue_autolabel.yml
vendored
@@ -383,4 +383,107 @@ jobs:
|
||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
- name: Request missing ROCm info from issue author
|
||||
if: contains(steps.label-step.outputs.labels_added, 'rocm') && contains(toJSON(github.event.issue.labels.*.name), 'bug')
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const body = (context.payload.issue.body || '').toLowerCase();
|
||||
|
||||
// Check for existing bot comments to avoid duplicate requests
|
||||
const comments = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
});
|
||||
const botAlreadyAsked = comments.data.some(
|
||||
c => c.user.type === 'Bot' && c.body.includes('<!-- rocm-info-request -->')
|
||||
);
|
||||
if (botAlreadyAsked) {
|
||||
core.notice('ROCm info request already posted, skipping');
|
||||
return;
|
||||
}
|
||||
|
||||
// Define required information and detection patterns
|
||||
const requiredInfo = [
|
||||
{
|
||||
name: 'Reproducer',
|
||||
patterns: [
|
||||
/reproduc/i, /minimal.?example/i, /repro\b/i, /steps to reproduce/i,
|
||||
/code.?snippet/i, /sample.?code/i,
|
||||
/```python[\s\S]*?```/, /```bash[\s\S]*?```/, /```sh[\s\S]*?```/,
|
||||
],
|
||||
ask: 'A minimal reproducer (code snippet or script that triggers the issue)',
|
||||
},
|
||||
{
|
||||
name: 'Error message',
|
||||
patterns: [
|
||||
/error/i, /traceback/i, /exception/i, /fault/i, /crash/i,
|
||||
/failed/i, /abort/i, /panic/i,
|
||||
],
|
||||
ask: 'The full error message or traceback',
|
||||
},
|
||||
{
|
||||
name: 'Installation method',
|
||||
patterns: [
|
||||
/docker/i, /rocm\/pytorch/i, /dockerfile/i, /from source/i,
|
||||
/pip install/i, /build.?from/i, /container/i, /image/i,
|
||||
/wheel/i, /\.whl/i, /nightly/i,
|
||||
],
|
||||
ask: 'How you installed vLLM (Docker image name, pip install, or build from source steps)',
|
||||
},
|
||||
{
|
||||
name: 'Command',
|
||||
patterns: [
|
||||
/vllm serve/i, /python\s+\S+\.py/i, /```bash[\s\S]*?```/,
|
||||
/```sh[\s\S]*?```/, /command/i, /launch/i, /run\s/i,
|
||||
/--model/i, /--tensor-parallel/i, /--gpu-memory/i,
|
||||
],
|
||||
ask: 'The command you used to launch vLLM (e.g., `vllm serve ...` or the Python script)',
|
||||
},
|
||||
{
|
||||
name: 'GFX architecture',
|
||||
patterns: [
|
||||
/gfx\d{3,4}/i, /mi\d{3}/i, /mi\d{2}\b/i, /radeon/i,
|
||||
/gpu.?arch/i, /rocm-smi/i, /rocminfo/i, /navi/i,
|
||||
/instinct/i,
|
||||
],
|
||||
ask: 'Your GPU model and GFX architecture (e.g., MI300X / gfx942) — run `rocminfo | grep gfx`',
|
||||
},
|
||||
];
|
||||
|
||||
const issueBody = context.payload.issue.body || '';
|
||||
const missing = requiredInfo.filter(info =>
|
||||
!info.patterns.some(p => p.test(issueBody))
|
||||
);
|
||||
|
||||
if (missing.length === 0) {
|
||||
core.notice('All required ROCm info appears to be present');
|
||||
return;
|
||||
}
|
||||
|
||||
const author = context.payload.issue.user.login;
|
||||
const checklist = requiredInfo.map(info => {
|
||||
const found = !missing.includes(info);
|
||||
return `- [${found ? 'x' : ' '}] ${info.ask}`;
|
||||
}).join('\n');
|
||||
const message = [
|
||||
'<!-- rocm-info-request -->',
|
||||
`Hi @${author}, thanks for reporting this ROCm issue!`,
|
||||
'',
|
||||
'To help us investigate, please make sure the following information is included:',
|
||||
'',
|
||||
checklist,
|
||||
'',
|
||||
'Please provide any unchecked items above. This will help us reproduce and resolve the issue faster. Thank you!',
|
||||
].join('\n');
|
||||
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: message,
|
||||
});
|
||||
core.notice(`Requested missing ROCm info from @${author}: ${missing.map(m => m.name).join(', ')}`);
|
||||
6
.github/workflows/macos-smoke-test.yml
vendored
6
.github/workflows/macos-smoke-test.yml
vendored
@@ -1,9 +1,9 @@
|
||||
name: macOS Apple Silicon Smoke Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Daily at 2:30 AM UTC
|
||||
- cron: '30 2 * * *'
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
permissions:
|
||||
|
||||
96
.github/workflows/new_pr_bot.yml
vendored
Normal file
96
.github/workflows/new_pr_bot.yml
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
name: New PR Bot
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
permissions:
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
update-description:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Update PR description
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { owner, repo } = context.repo;
|
||||
const pr_number = context.issue.number;
|
||||
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
owner,
|
||||
repo,
|
||||
pull_number: pr_number,
|
||||
});
|
||||
|
||||
let body = pr.body || '';
|
||||
const original = body;
|
||||
|
||||
// Remove markdown comments (<!-- ... -->)
|
||||
body = body.replace(/^<!--.*-->$/gm, '');
|
||||
|
||||
// Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ..."
|
||||
body = body.replace(/^PLEASE FILL IN THE PR DESCRIPTION HERE.*$/gm, '');
|
||||
|
||||
// Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ ..."
|
||||
body = body.replace(/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*[\s\S]*$/, '');
|
||||
|
||||
// Remove <details> section containing "PR Checklist (Click to Expand)"
|
||||
body = body.replace(/(---\n\n)?<details>[\s\S]*?<summary>[\s\S]*?PR Checklist \(Click to Expand\)[\s\S]*?<\/summary>[\s\S]*?<\/details>/g, '');
|
||||
|
||||
if (body !== original) {
|
||||
await github.rest.pulls.update({
|
||||
owner,
|
||||
repo,
|
||||
pull_number: pr_number,
|
||||
body,
|
||||
});
|
||||
console.log('Updated PR body');
|
||||
} else {
|
||||
console.log('No changes needed');
|
||||
}
|
||||
|
||||
reminder-comment:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Post welcome comment for first-time contributors
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { owner, repo } = context.repo;
|
||||
const prAuthor = context.payload.pull_request.user.login;
|
||||
|
||||
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${owner}/${repo} type:pr author:${prAuthor}`,
|
||||
per_page: 1,
|
||||
});
|
||||
|
||||
const authorPRCount = searchResults.total_count;
|
||||
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
|
||||
|
||||
if (authorPRCount === 1) {
|
||||
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
|
||||
await github.rest.issues.createComment({
|
||||
owner,
|
||||
repo,
|
||||
issue_number: context.issue.number,
|
||||
body: [
|
||||
'\u{1f44b} Hi! Thank you for contributing to the vLLM project.',
|
||||
'',
|
||||
'\u{1f4ac} Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.',
|
||||
'',
|
||||
'Just a reminder: PRs would not trigger full CI run by default.',
|
||||
'',
|
||||
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.',
|
||||
'',
|
||||
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.',
|
||||
'',
|
||||
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.',
|
||||
'',
|
||||
'\u{1f680}',
|
||||
].join('\n'),
|
||||
});
|
||||
} else {
|
||||
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
|
||||
}
|
||||
30
.github/workflows/pre-commit.yml
vendored
30
.github/workflows/pre-commit.yml
vendored
@@ -11,9 +11,39 @@ concurrency:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
|
||||
jobs:
|
||||
pre-run-check:
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check PR label and author merge count
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
...context.repo,
|
||||
pull_number: context.payload.pull_request.number,
|
||||
});
|
||||
|
||||
const hasReadyLabel = pr.labels.some(l => l.name === 'ready');
|
||||
|
||||
const { data: mergedPRs } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged author:${pr.user.login}`,
|
||||
per_page: 4,
|
||||
});
|
||||
const mergedCount = mergedPRs.total_count;
|
||||
|
||||
if (hasReadyLabel || mergedCount >= 4) {
|
||||
core.info(`Check passed: ready label=${hasReadyLabel}, 4+ merged PRs=${mergedCount >= 4}`);
|
||||
} else {
|
||||
core.setFailed(`PR must have the 'ready' label or the author must have at least 4 merged PRs (found ${mergedCount}).`);
|
||||
}
|
||||
|
||||
pre-commit:
|
||||
needs: pre-run-check
|
||||
if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
|
||||
|
||||
54
.github/workflows/reminder_comment.yml
vendored
54
.github/workflows/reminder_comment.yml
vendored
@@ -1,54 +0,0 @@
|
||||
name: PR Reminder Comment Bot
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
jobs:
|
||||
pr_reminder:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Remind to run full CI on PR
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
try {
|
||||
// Get the PR author
|
||||
const prAuthor = context.payload.pull_request.user.login;
|
||||
|
||||
// Check if this is the author's first PR in this repository
|
||||
// Use GitHub's search API to find all PRs by this author
|
||||
const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
|
||||
q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
|
||||
per_page: 100
|
||||
});
|
||||
|
||||
const authorPRCount = searchResults.total_count;
|
||||
|
||||
console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
|
||||
|
||||
// Only post comment if this is the first PR (only one PR by this author)
|
||||
if (authorPRCount === 1) {
|
||||
console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number: context.issue.number,
|
||||
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
|
||||
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
|
||||
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
|
||||
'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
|
||||
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
|
||||
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
|
||||
'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
|
||||
'🚀'
|
||||
});
|
||||
} else {
|
||||
console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error checking PR history or posting comment:', error);
|
||||
// Don't fail the workflow, just log the error
|
||||
}
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -189,11 +189,9 @@ cython_debug/
|
||||
.vscode/
|
||||
|
||||
# Claude
|
||||
CLAUDE.md
|
||||
.claude/
|
||||
|
||||
# Codex
|
||||
AGENTS.md
|
||||
.codex/
|
||||
|
||||
# Cursor
|
||||
|
||||
@@ -30,6 +30,7 @@ repos:
|
||||
- id: markdownlint-cli2
|
||||
language_version: lts
|
||||
args: [--fix]
|
||||
exclude: ^CLAUDE\.md$
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
hooks:
|
||||
|
||||
@@ -9,7 +9,7 @@ build:
|
||||
python: "3.12"
|
||||
jobs:
|
||||
post_checkout:
|
||||
- bash docs/maybe_skip_pr_build.sh
|
||||
# - bash docs/maybe_skip_pr_build.sh
|
||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||
pre_create_environment:
|
||||
- pip install uv
|
||||
|
||||
113
AGENTS.md
Normal file
113
AGENTS.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# Agent Instructions for vLLM
|
||||
|
||||
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
|
||||
> Breaching these guidelines can result in automatic banning.
|
||||
|
||||
## 1. Contribution Policy (Mandatory)
|
||||
|
||||
### Duplicate-work checks
|
||||
|
||||
Before proposing a PR, run these checks:
|
||||
|
||||
```bash
|
||||
gh issue view <issue_number> --repo vllm-project/vllm --comments
|
||||
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
|
||||
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
|
||||
```
|
||||
|
||||
- If an open PR already addresses the same fix, do not open another.
|
||||
- If your approach is materially different, explain the difference in the issue.
|
||||
|
||||
### No low-value busywork PRs
|
||||
|
||||
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
|
||||
|
||||
### Accountability
|
||||
|
||||
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
|
||||
- The submitting human must review every changed line and run relevant tests.
|
||||
- PR descriptions for AI-assisted work **must** include:
|
||||
- Why this is not duplicating an existing PR.
|
||||
- Test commands run and results.
|
||||
- Clear statement that AI assistance was used.
|
||||
|
||||
### Fail-closed behavior
|
||||
|
||||
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
|
||||
|
||||
---
|
||||
|
||||
## 2. Development Workflow
|
||||
|
||||
### Environment setup
|
||||
|
||||
```bash
|
||||
# Install `uv` if you don't have it already:
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Always use `uv` for Python environment management:
|
||||
uv venv --python 3.12
|
||||
source .venv/bin/activate
|
||||
|
||||
# Always make sure `pre-commit` and its hooks are installed:
|
||||
uv pip install -r requirements/lint.txt
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
### Installing dependencies
|
||||
|
||||
```bash
|
||||
# If you are only making Python changes:
|
||||
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||
|
||||
# If you are also making C/C++ changes:
|
||||
uv pip install -e .
|
||||
```
|
||||
|
||||
### Running tests
|
||||
|
||||
Tests require extra dependencies.
|
||||
All versions for test dependencies should be read from `requirements/test.txt`
|
||||
|
||||
```bash
|
||||
# Install bare minimum test dependencies:
|
||||
uv pip install pytest pytest-asyncio tblib
|
||||
|
||||
# Install additional test dependencies as needed, or install them all as follows:
|
||||
uv pip install -r requirements/test.txt
|
||||
|
||||
# Run specific test from specific test file
|
||||
pytest tests/path/to/test.py -v -s -k test_name
|
||||
|
||||
# Run all tests in directory
|
||||
pytest tests/path/to/dir -v -s
|
||||
```
|
||||
|
||||
### Running linters
|
||||
|
||||
```bash
|
||||
# Run all pre-commit hooks on staged files:
|
||||
pre-commit run
|
||||
|
||||
# Run on all files:
|
||||
pre-commit run --all-files
|
||||
|
||||
# Run a specific hook:
|
||||
pre-commit run ruff-check --all-files
|
||||
|
||||
# Run mypy as it is in CI:
|
||||
pre-commit run mypy-3.10 --all-files --hook-stage manual
|
||||
```
|
||||
|
||||
### Commit messages
|
||||
|
||||
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
|
||||
|
||||
```text
|
||||
Your commit message here
|
||||
|
||||
Co-authored-by: GitHub Copilot
|
||||
Co-authored-by: Claude
|
||||
Co-authored-by: gemini-code-assist
|
||||
Signed-off-by: Your Name <your.email@example.com>
|
||||
```
|
||||
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
|
||||
|
||||
# ROCm installation prefix. Default to /opt/rocm but allow override via
|
||||
# -DROCM_PATH=/your/rocm/path when invoking cmake.
|
||||
@@ -340,7 +340,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
@@ -986,6 +985,48 @@ define_extension_target(
|
||||
# Setting this variable sidesteps the issue by calling the driver directly.
|
||||
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
||||
|
||||
# add OR VLLM_GPU_LANG STREQUAL "HIP" here once
|
||||
# https://github.com/vllm-project/vllm/issues/35163 is resolved
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
#
|
||||
# _C_stable_libtorch extension (ops registered via STABLE_TORCH_LIBRARY)
|
||||
#
|
||||
set(VLLM_STABLE_EXT_SRC
|
||||
"csrc/libtorch_stable/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_STABLE_EXT_SRC "csrc/libtorch_stable/permute_cols.cu")
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_STABLE_EXT_SRC}"
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
endif()
|
||||
|
||||
message(STATUS "Enabling C_stable extension.")
|
||||
define_extension_target(
|
||||
_C_stable_libtorch
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${VLLM_STABLE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
# Set TORCH_TARGET_VERSION for stable ABI compatibility.
|
||||
# This ensures we only use C-shim APIs available in PyTorch 2.10.
|
||||
# _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
|
||||
# which is currently set to 2.10.
|
||||
target_compile_definitions(_C_stable_libtorch PRIVATE
|
||||
TORCH_TARGET_VERSION=0x020A000000000000ULL)
|
||||
|
||||
# Needed to use cuda APIs from C-shim
|
||||
target_compile_definitions(_C_stable_libtorch PRIVATE
|
||||
USE_CUDA)
|
||||
endif()
|
||||
|
||||
#
|
||||
# _moe_C extension
|
||||
#
|
||||
@@ -999,6 +1040,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/moe_wna16.cu"
|
||||
"csrc/moe/grouped_topk_kernels.cu"
|
||||
"csrc/moe/gpt_oss_router_gemm.cu"
|
||||
"csrc/moe/router_gemm.cu")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -47,6 +47,8 @@ from common import (
|
||||
is_mla_backend,
|
||||
)
|
||||
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
|
||||
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
|
||||
@@ -59,7 +61,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
"""Run MLA benchmark with appropriate backend."""
|
||||
from mla_runner import run_mla_benchmark as run_mla
|
||||
|
||||
return run_mla(config.backend, config, **kwargs)
|
||||
return run_mla(
|
||||
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
|
||||
)
|
||||
|
||||
|
||||
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
|
||||
@@ -440,20 +444,27 @@ def main():
|
||||
# Backend selection
|
||||
parser.add_argument(
|
||||
"--backends",
|
||||
"--decode-backends",
|
||||
nargs="+",
|
||||
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
||||
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
|
||||
"flashinfer_mla, flashattn_mla, flashmla)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
help="Single backend (alternative to --backends)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefill-backends",
|
||||
nargs="+",
|
||||
help="Prefill backends to compare (fa2, fa3, fa4). "
|
||||
"Uses the first decode backend for impl construction.",
|
||||
)
|
||||
|
||||
# Batch specifications
|
||||
parser.add_argument(
|
||||
"--batch-specs",
|
||||
nargs="+",
|
||||
default=["q2k", "8q1s1k"],
|
||||
default=None,
|
||||
help="Batch specifications using extended grammar",
|
||||
)
|
||||
|
||||
@@ -469,6 +480,21 @@ def main():
|
||||
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
|
||||
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
|
||||
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype",
|
||||
default="auto",
|
||||
choices=["auto", "fp8"],
|
||||
help="KV cache dtype: auto or fp8",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cuda-graphs",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
default=True,
|
||||
help=(
|
||||
"Launch kernels with CUDA graphs to eliminate CPU overhead"
|
||||
"in measurements (default: True)"
|
||||
),
|
||||
)
|
||||
|
||||
# Parameter sweep (use YAML config for advanced sweeps)
|
||||
parser.add_argument(
|
||||
@@ -502,7 +528,7 @@ def main():
|
||||
|
||||
# Override args with YAML values, but CLI args take precedence
|
||||
# Check if CLI provided backends (they would be non-None and not default)
|
||||
cli_backends_provided = args.backends is not None or args.backend is not None
|
||||
cli_backends_provided = args.backend is not None or args.backends is not None
|
||||
|
||||
# Backend(s) - only use YAML if CLI didn't specify
|
||||
if not cli_backends_provided:
|
||||
@@ -512,6 +538,12 @@ def main():
|
||||
elif "backends" in yaml_config:
|
||||
args.backends = yaml_config["backends"]
|
||||
args.backend = None
|
||||
elif "decode_backends" in yaml_config:
|
||||
args.backends = yaml_config["decode_backends"]
|
||||
args.backend = None
|
||||
|
||||
# Prefill backends (e.g., ["fa3", "fa4"])
|
||||
args.prefill_backends = yaml_config.get("prefill_backends", None)
|
||||
|
||||
# Check for special modes
|
||||
if "mode" in yaml_config:
|
||||
@@ -521,21 +553,24 @@ def main():
|
||||
|
||||
# Batch specs and sizes
|
||||
# Support both explicit batch_specs and generated batch_spec_ranges
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
# CLI --batch-specs takes precedence over YAML when provided.
|
||||
cli_batch_specs_provided = args.batch_specs is not None
|
||||
if not cli_batch_specs_provided:
|
||||
if "batch_spec_ranges" in yaml_config:
|
||||
# Generate batch specs from ranges
|
||||
generated_specs = generate_batch_specs_from_ranges(
|
||||
yaml_config["batch_spec_ranges"]
|
||||
)
|
||||
# Combine with any explicit batch_specs
|
||||
if "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"] + generated_specs
|
||||
else:
|
||||
args.batch_specs = generated_specs
|
||||
console.print(
|
||||
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
|
||||
)
|
||||
elif "batch_specs" in yaml_config:
|
||||
args.batch_specs = yaml_config["batch_specs"]
|
||||
|
||||
if "batch_sizes" in yaml_config:
|
||||
args.batch_sizes = yaml_config["batch_sizes"]
|
||||
@@ -560,6 +595,10 @@ def main():
|
||||
args.warmup_iters = yaml_config["warmup_iters"]
|
||||
if "profile_memory" in yaml_config:
|
||||
args.profile_memory = yaml_config["profile_memory"]
|
||||
if "kv_cache_dtype" in yaml_config:
|
||||
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
|
||||
if "cuda_graphs" in yaml_config:
|
||||
args.cuda_graphs = yaml_config["cuda_graphs"]
|
||||
|
||||
# Parameter sweep configuration
|
||||
if "parameter_sweep" in yaml_config:
|
||||
@@ -613,10 +652,19 @@ def main():
|
||||
|
||||
# Determine backends
|
||||
backends = args.backends or ([args.backend] if args.backend else ["flash"])
|
||||
prefill_backends = getattr(args, "prefill_backends", None)
|
||||
if not args.batch_specs:
|
||||
args.batch_specs = ["q2k", "8q1s1k"]
|
||||
console.print(f"Backends: {', '.join(backends)}")
|
||||
if prefill_backends:
|
||||
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
|
||||
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
|
||||
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
|
||||
console.print(f"CUDA graphs: {args.cuda_graphs}")
|
||||
console.print()
|
||||
|
||||
init_workspace_manager(args.device)
|
||||
|
||||
# Run benchmarks
|
||||
all_results = []
|
||||
|
||||
@@ -669,6 +717,8 @@ def main():
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
use_cuda_graphs=args.cuda_graphs,
|
||||
)
|
||||
|
||||
# Add decode pipeline config
|
||||
@@ -821,6 +871,8 @@ def main():
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
"kv_cache_dtype": args.kv_cache_dtype,
|
||||
"use_cuda_graphs": args.cuda_graphs,
|
||||
}
|
||||
all_results = run_model_parameter_sweep(
|
||||
backends,
|
||||
@@ -843,6 +895,8 @@ def main():
|
||||
"repeats": args.repeats,
|
||||
"warmup_iters": args.warmup_iters,
|
||||
"profile_memory": args.profile_memory,
|
||||
"kv_cache_dtype": args.kv_cache_dtype,
|
||||
"use_cuda_graphs": args.cuda_graphs,
|
||||
}
|
||||
all_results = run_parameter_sweep(
|
||||
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
|
||||
@@ -850,37 +904,95 @@ def main():
|
||||
|
||||
else:
|
||||
# Normal mode: compare backends
|
||||
total = len(backends) * len(args.batch_specs)
|
||||
decode_results = []
|
||||
prefill_results = []
|
||||
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for spec in args.batch_specs:
|
||||
for backend in backends:
|
||||
config = BenchmarkConfig(
|
||||
backend=backend,
|
||||
batch_spec=spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
)
|
||||
# Run decode backend comparison
|
||||
if not prefill_backends:
|
||||
# No prefill backends specified: compare decode backends as before
|
||||
total = len(backends) * len(args.batch_specs)
|
||||
|
||||
result = run_benchmark(config)
|
||||
all_results.append(result)
|
||||
with tqdm(total=total, desc="Benchmarking") as pbar:
|
||||
for spec in args.batch_specs:
|
||||
for backend in backends:
|
||||
config = BenchmarkConfig(
|
||||
backend=backend,
|
||||
batch_spec=spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
use_cuda_graphs=args.cuda_graphs,
|
||||
)
|
||||
|
||||
if not result.success:
|
||||
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
|
||||
result = run_benchmark(config)
|
||||
decode_results.append(result)
|
||||
|
||||
pbar.update(1)
|
||||
if not result.success:
|
||||
console.print(
|
||||
f"[red]Error {backend} {spec}: {result.error}[/]"
|
||||
)
|
||||
|
||||
# Display results
|
||||
console.print("\n[bold green]Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(all_results, backends)
|
||||
pbar.update(1)
|
||||
|
||||
console.print("\n[bold green]Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(decode_results, backends)
|
||||
|
||||
# Run prefill backend comparison
|
||||
if prefill_backends:
|
||||
# Use first decode backend for impl construction
|
||||
decode_backend = backends[0]
|
||||
total = len(prefill_backends) * len(args.batch_specs)
|
||||
|
||||
console.print(
|
||||
f"[yellow]Prefill comparison mode: "
|
||||
f"using {decode_backend} for decode impl[/]"
|
||||
)
|
||||
|
||||
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
|
||||
for spec in args.batch_specs:
|
||||
for pb in prefill_backends:
|
||||
config = BenchmarkConfig(
|
||||
backend=decode_backend,
|
||||
batch_spec=spec,
|
||||
num_layers=args.num_layers,
|
||||
head_dim=args.head_dim,
|
||||
num_q_heads=args.num_q_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
block_size=args.block_size,
|
||||
device=args.device,
|
||||
repeats=args.repeats,
|
||||
warmup_iters=args.warmup_iters,
|
||||
profile_memory=args.profile_memory,
|
||||
prefill_backend=pb,
|
||||
)
|
||||
|
||||
result = run_benchmark(config)
|
||||
|
||||
# Label result with prefill backend name for display
|
||||
labeled_config = replace(result.config, backend=pb)
|
||||
result = replace(result, config=labeled_config)
|
||||
prefill_results.append(result)
|
||||
|
||||
if not result.success:
|
||||
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
|
||||
|
||||
pbar.update(1)
|
||||
|
||||
console.print("\n[bold green]Prefill Backend Results:[/]")
|
||||
formatter = ResultsFormatter(console)
|
||||
formatter.print_table(
|
||||
prefill_results, prefill_backends, compare_to_fastest=True
|
||||
)
|
||||
|
||||
all_results = decode_results + prefill_results
|
||||
|
||||
# Save results
|
||||
if all_results:
|
||||
|
||||
@@ -77,6 +77,7 @@ class MockKVBProj:
|
||||
self.qk_nope_head_dim = qk_nope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.out_dim = qk_nope_head_dim + v_head_dim
|
||||
self.weight = torch.empty(0, dtype=torch.bfloat16)
|
||||
|
||||
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
||||
"""
|
||||
@@ -212,7 +213,11 @@ class BenchmarkConfig:
|
||||
profile_memory: bool = False
|
||||
use_cuda_graphs: bool = False
|
||||
|
||||
# "auto" or "fp8"
|
||||
kv_cache_dtype: str = "auto"
|
||||
|
||||
# MLA-specific
|
||||
prefill_backend: str | None = None
|
||||
kv_lora_rank: int | None = None
|
||||
qk_nope_head_dim: int | None = None
|
||||
qk_rope_head_dim: int | None = None
|
||||
@@ -367,6 +372,7 @@ class ResultsFormatter:
|
||||
"backend",
|
||||
"batch_spec",
|
||||
"num_layers",
|
||||
"kv_cache_dtype",
|
||||
"mean_time",
|
||||
"std_time",
|
||||
"throughput",
|
||||
@@ -380,6 +386,7 @@ class ResultsFormatter:
|
||||
"backend": r.config.backend,
|
||||
"batch_spec": r.config.batch_spec,
|
||||
"num_layers": r.config.num_layers,
|
||||
"kv_cache_dtype": r.config.kv_cache_dtype,
|
||||
"mean_time": r.mean_time,
|
||||
"std_time": r.std_time,
|
||||
"throughput": r.throughput_tokens_per_sec or 0,
|
||||
|
||||
@@ -30,9 +30,9 @@ batch_specs:
|
||||
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
|
||||
|
||||
# Context extension + decode
|
||||
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
|
||||
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode
|
||||
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode
|
||||
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode
|
||||
|
||||
# Explicitly chunked prefill
|
||||
- "q8k" # 8k prefill with chunking hint
|
||||
|
||||
@@ -1,4 +1,19 @@
|
||||
# MLA prefill-only benchmark configuration for sparse backends
|
||||
# MLA prefill backend comparison
|
||||
#
|
||||
# Compares all available MLA prefill backends:
|
||||
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
|
||||
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
|
||||
#
|
||||
# Uses cutlass_mla as the decode backend for impl construction
|
||||
# (only the prefill path is exercised).
|
||||
#
|
||||
# Backends that aren't available on the current platform will report errors
|
||||
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
|
||||
#
|
||||
# Usage:
|
||||
# python benchmark.py --config configs/mla_prefill.yaml
|
||||
|
||||
description: "MLA prefill backend comparison"
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
@@ -12,20 +27,25 @@ model:
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
# model:
|
||||
# name: "deepseek-v2-lite"
|
||||
# num_layers: 27
|
||||
# num_q_heads: 16
|
||||
# num_kv_heads: 1
|
||||
# head_dim: 576
|
||||
# kv_lora_rank: 512
|
||||
# qk_nope_head_dim: 128
|
||||
# qk_rope_head_dim: 64
|
||||
# v_head_dim: 128
|
||||
# block_size: 128
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "1q512"
|
||||
- "1q1k"
|
||||
- "1q2k"
|
||||
- "1q4k"
|
||||
- "1q8k"
|
||||
- "q512"
|
||||
- "q1k"
|
||||
- "q2k"
|
||||
- "q4k"
|
||||
- "q8k"
|
||||
|
||||
# Batched pure prefill
|
||||
- "2q512"
|
||||
@@ -44,19 +64,63 @@ batch_specs:
|
||||
- "8q4k"
|
||||
- "8q8k"
|
||||
|
||||
# Extend
|
||||
- "1q512s4k"
|
||||
- "1q512s8k"
|
||||
- "1q1ks8k"
|
||||
- "1q2ks8k"
|
||||
- "1q2ks16k"
|
||||
- "1q4ks16k"
|
||||
# Chunked prefill / extend
|
||||
# Short context
|
||||
- "q128s1k"
|
||||
- "q256s2k"
|
||||
- "q512s4k"
|
||||
- "q1ks4k"
|
||||
- "q2ks8k"
|
||||
- "2q128s1k"
|
||||
- "2q256s2k"
|
||||
- "2q512s4k"
|
||||
- "2q1ks4k"
|
||||
- "2q2ks8k"
|
||||
- "4q128s1k"
|
||||
- "4q256s2k"
|
||||
- "4q512s4k"
|
||||
- "4q1ks4k"
|
||||
- "4q2ks8k"
|
||||
- "8q128s1k"
|
||||
- "8q256s2k"
|
||||
- "8q512s4k"
|
||||
- "8q1ks4k"
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
# Medium context
|
||||
- "q128s16k"
|
||||
- "q512s16k"
|
||||
- "q1ks16k"
|
||||
- "q2ks16k"
|
||||
- "2q128s16k"
|
||||
- "2q512s16k"
|
||||
- "2q1ks16k"
|
||||
- "2q2ks16k"
|
||||
- "4q128s16k"
|
||||
- "4q512s16k"
|
||||
- "4q1ks16k"
|
||||
- "4q2ks16k"
|
||||
|
||||
# Long context
|
||||
- "q128s64k"
|
||||
- "q512s64k"
|
||||
- "q1ks64k"
|
||||
- "q2ks64k"
|
||||
- "2q128s64k"
|
||||
- "2q512s64k"
|
||||
- "2q1ks64k"
|
||||
- "2q2ks64k"
|
||||
|
||||
decode_backends:
|
||||
- CUTLASS_MLA
|
||||
|
||||
prefill_backends:
|
||||
- fa2
|
||||
- fa3
|
||||
- fa4
|
||||
- flashinfer
|
||||
- cudnn
|
||||
- trtllm
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 10
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
repeats: 20
|
||||
warmup_iters: 5
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
# MLA decode-only benchmark configuration
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128 # Base value, can be swept for TP simulation
|
||||
num_kv_heads: 1 # MLA uses single latent KV
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Small batches, varying sequence lengths
|
||||
- "16q1s512" # 16 requests, 512 KV cache
|
||||
- "16q1s1k" # 16 requests, 1k KV cache
|
||||
- "16q1s2k" # 16 requests, 2k KV cache
|
||||
- "16q1s4k" # 16 requests, 4k KV cache
|
||||
|
||||
# Medium batches
|
||||
- "32q1s1k" # 32 requests, 1k KV cache
|
||||
- "32q1s2k" # 32 requests, 2k KV cache
|
||||
- "32q1s4k" # 32 requests, 4k KV cache
|
||||
- "32q1s8k" # 32 requests, 8k KV cache
|
||||
|
||||
# Large batches
|
||||
- "64q1s1k" # 64 requests, 1k KV cache
|
||||
- "64q1s2k" # 64 requests, 2k KV cache
|
||||
- "64q1s4k" # 64 requests, 4k KV cache
|
||||
- "64q1s8k" # 64 requests, 8k KV cache
|
||||
|
||||
# Very large batches
|
||||
- "128q1s1k" # 128 requests, 1k KV cache
|
||||
- "128q1s2k" # 128 requests, 2k KV cache
|
||||
- "128q1s4k" # 128 requests, 4k KV cache
|
||||
- "128q1s8k" # 128 requests, 8k KV cache
|
||||
|
||||
# Long context
|
||||
- "32q1s16k" # 32 requests, 16k KV cache
|
||||
- "32q1s32k" # 32 requests, 32k KV cache
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 100
|
||||
warmup_iters: 10
|
||||
profile_memory: true
|
||||
@@ -0,0 +1,62 @@
|
||||
# MLA prefill-only benchmark configuration for sparse backends
|
||||
|
||||
model:
|
||||
name: "deepseek-v3"
|
||||
num_layers: 60
|
||||
num_q_heads: 128
|
||||
num_kv_heads: 1
|
||||
head_dim: 576
|
||||
kv_lora_rank: 512
|
||||
qk_nope_head_dim: 128
|
||||
qk_rope_head_dim: 64
|
||||
v_head_dim: 128
|
||||
block_size: 128
|
||||
|
||||
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
|
||||
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
|
||||
model_parameter_sweep:
|
||||
param_name: "num_q_heads"
|
||||
values: [128, 64, 32, 16]
|
||||
label_format: "{backend}_{value}h"
|
||||
|
||||
batch_specs:
|
||||
# Pure prefill
|
||||
- "1q512"
|
||||
- "1q1k"
|
||||
- "1q2k"
|
||||
- "1q4k"
|
||||
- "1q8k"
|
||||
|
||||
# Batched pure prefill
|
||||
- "2q512"
|
||||
- "2q1k"
|
||||
- "2q2k"
|
||||
- "2q4k"
|
||||
- "2q8k"
|
||||
- "4q512"
|
||||
- "4q1k"
|
||||
- "4q2k"
|
||||
- "4q4k"
|
||||
- "4q8k"
|
||||
- "8q512"
|
||||
- "8q1k"
|
||||
- "8q2k"
|
||||
- "8q4k"
|
||||
- "8q8k"
|
||||
|
||||
# Extend
|
||||
- "1q512s4k"
|
||||
- "1q512s8k"
|
||||
- "1q1ks8k"
|
||||
- "1q2ks8k"
|
||||
- "1q2ks16k"
|
||||
- "1q4ks16k"
|
||||
|
||||
backends:
|
||||
- FLASHMLA_SPARSE
|
||||
- FLASHINFER_MLA_SPARSE
|
||||
|
||||
device: "cuda:0"
|
||||
repeats: 10
|
||||
warmup_iters: 3
|
||||
profile_memory: true
|
||||
@@ -60,8 +60,11 @@ def create_minimal_vllm_config(
|
||||
model_name: str = "deepseek-v3",
|
||||
block_size: int = 128,
|
||||
max_num_seqs: int = 256,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
mla_dims: dict | None = None,
|
||||
index_topk: int | None = None,
|
||||
prefill_backend: str | None = None,
|
||||
kv_cache_dtype: str = "auto",
|
||||
) -> VllmConfig:
|
||||
"""
|
||||
Create minimal VllmConfig for MLA benchmarks.
|
||||
@@ -75,6 +78,9 @@ def create_minimal_vllm_config(
|
||||
setup_mla_dims(model_name)
|
||||
index_topk: Optional topk value for sparse MLA backends. If provided,
|
||||
the config will include index_topk for sparse attention.
|
||||
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
|
||||
"cudnn", "trtllm"). Configures the attention config to
|
||||
force the specified prefill backend.
|
||||
|
||||
Returns:
|
||||
VllmConfig for benchmarking
|
||||
@@ -145,13 +151,13 @@ def create_minimal_vllm_config(
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
cache_dtype="auto",
|
||||
cache_dtype=kv_cache_dtype,
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=8192,
|
||||
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
|
||||
max_model_len=32768,
|
||||
is_encoder_decoder=False,
|
||||
enable_chunked_prefill=True,
|
||||
@@ -163,7 +169,7 @@ def create_minimal_vllm_config(
|
||||
|
||||
compilation_config = CompilationConfig()
|
||||
|
||||
return VllmConfig(
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
@@ -171,9 +177,84 @@ def create_minimal_vllm_config(
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
if prefill_backend is not None:
|
||||
prefill_cfg = get_prefill_backend_config(prefill_backend)
|
||||
if prefill_cfg["flash_attn_version"] is not None:
|
||||
vllm_config.attention_config.flash_attn_version = prefill_cfg[
|
||||
"flash_attn_version"
|
||||
]
|
||||
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
|
||||
"disable_flashinfer_prefill"
|
||||
]
|
||||
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
|
||||
"use_cudnn_prefill"
|
||||
]
|
||||
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
|
||||
"use_trtllm_ragged_deepseek_prefill"
|
||||
]
|
||||
|
||||
return vllm_config
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backend Configuration
|
||||
# Prefill Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
# Maps prefill backend names to attention config overrides.
|
||||
# FA backends set flash_attn_version and disable non-FA paths.
|
||||
# Non-FA backends enable their specific path and disable others.
|
||||
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
|
||||
"fa2": {
|
||||
"flash_attn_version": 2,
|
||||
"disable_flashinfer_prefill": True,
|
||||
"use_cudnn_prefill": False,
|
||||
"use_trtllm_ragged_deepseek_prefill": False,
|
||||
},
|
||||
"fa3": {
|
||||
"flash_attn_version": 3,
|
||||
"disable_flashinfer_prefill": True,
|
||||
"use_cudnn_prefill": False,
|
||||
"use_trtllm_ragged_deepseek_prefill": False,
|
||||
},
|
||||
"fa4": {
|
||||
"flash_attn_version": 4,
|
||||
"disable_flashinfer_prefill": True,
|
||||
"use_cudnn_prefill": False,
|
||||
"use_trtllm_ragged_deepseek_prefill": False,
|
||||
},
|
||||
"flashinfer": {
|
||||
"flash_attn_version": None,
|
||||
"disable_flashinfer_prefill": False,
|
||||
"use_cudnn_prefill": False,
|
||||
"use_trtllm_ragged_deepseek_prefill": False,
|
||||
},
|
||||
"cudnn": {
|
||||
"flash_attn_version": None,
|
||||
"disable_flashinfer_prefill": True,
|
||||
"use_cudnn_prefill": True,
|
||||
"use_trtllm_ragged_deepseek_prefill": False,
|
||||
},
|
||||
"trtllm": {
|
||||
"flash_attn_version": None,
|
||||
"disable_flashinfer_prefill": True,
|
||||
"use_cudnn_prefill": False,
|
||||
"use_trtllm_ragged_deepseek_prefill": True,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def get_prefill_backend_config(prefill_backend: str) -> dict:
|
||||
"""Get attention config overrides for a prefill backend."""
|
||||
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
|
||||
raise ValueError(
|
||||
f"Unknown prefill backend: {prefill_backend!r}. "
|
||||
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
|
||||
)
|
||||
return _PREFILL_BACKEND_CONFIG[prefill_backend]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Decode Backend Configuration
|
||||
# ============================================================================
|
||||
|
||||
|
||||
@@ -203,6 +284,7 @@ def _get_backend_config(backend: str) -> dict:
|
||||
Returns:
|
||||
Dict with backend configuration
|
||||
"""
|
||||
from vllm.v1.attention.backend import MultipleOf
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
try:
|
||||
@@ -219,8 +301,8 @@ def _get_backend_config(backend: str) -> dict:
|
||||
block_sizes = backend_class.get_supported_kernel_block_sizes()
|
||||
# Use first supported block size (backends typically support one for MLA)
|
||||
block_size = block_sizes[0] if block_sizes else None
|
||||
if hasattr(block_size, "value"):
|
||||
# Handle MultipleOf enum
|
||||
if isinstance(block_size, MultipleOf):
|
||||
# No fixed block size; fall back to config value
|
||||
block_size = None
|
||||
|
||||
# Check if sparse via class method if available
|
||||
@@ -455,6 +537,7 @@ def _create_backend_impl(
|
||||
device: torch.device,
|
||||
max_num_tokens: int = 8192,
|
||||
index_topk: int | None = None,
|
||||
kv_cache_dtype: str = "auto",
|
||||
):
|
||||
"""
|
||||
Create backend implementation instance.
|
||||
@@ -503,7 +586,7 @@ def _create_backend_impl(
|
||||
"num_kv_heads": mla_dims["num_kv_heads"],
|
||||
"alibi_slopes": None,
|
||||
"sliding_window": None,
|
||||
"kv_cache_dtype": "auto",
|
||||
"kv_cache_dtype": kv_cache_dtype,
|
||||
"logits_soft_cap": None,
|
||||
"attn_type": "decoder",
|
||||
"kv_sharing_target_layer_name": None,
|
||||
@@ -621,6 +704,7 @@ def _run_single_benchmark(
|
||||
mla_dims: dict,
|
||||
device: torch.device,
|
||||
indexer=None,
|
||||
kv_cache_dtype: str | None = None,
|
||||
) -> BenchmarkResult:
|
||||
"""
|
||||
Run a single benchmark iteration.
|
||||
@@ -654,54 +738,124 @@ def _run_single_benchmark(
|
||||
)
|
||||
|
||||
# Create KV cache
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
if kv_cache_dtype is None:
|
||||
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
|
||||
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
|
||||
if kv_cache_dtype == "fp8_ds_mla":
|
||||
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
|
||||
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
|
||||
# + 2*rope_dim bf16 bytes
|
||||
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
656,
|
||||
device=device,
|
||||
dtype=torch.uint8,
|
||||
)
|
||||
elif kv_cache_dtype == "fp8":
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Create input tensors for both decode and prefill modes
|
||||
decode_inputs, prefill_inputs = _create_input_tensors(
|
||||
total_q,
|
||||
mla_dims,
|
||||
backend_cfg["query_format"],
|
||||
device,
|
||||
torch.bfloat16,
|
||||
)
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
head_size,
|
||||
device=device,
|
||||
dtype=torch.uint8,
|
||||
).view(current_platform.fp8_dtype())
|
||||
else:
|
||||
kv_cache = torch.zeros(
|
||||
num_blocks,
|
||||
block_size,
|
||||
head_size,
|
||||
device=device,
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
# Fill indexer with random indices for sparse backends
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
if is_sparse and indexer is not None:
|
||||
indexer.fill_random_indices(total_q, max_kv_len)
|
||||
|
||||
# Determine which forward method to use
|
||||
if is_sparse:
|
||||
# Sparse backends use forward_mqa
|
||||
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
|
||||
elif metadata.decode is not None:
|
||||
forward_fn = lambda: impl._forward_decode(
|
||||
decode_inputs, kv_cache, metadata, layer
|
||||
)
|
||||
elif metadata.prefill is not None:
|
||||
forward_fn = lambda: impl._forward_prefill(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
else:
|
||||
# Determine which forward methods to use based on metadata.
|
||||
# Sparse MLA backends always use forward_mqa
|
||||
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
|
||||
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
|
||||
if not has_decode and not has_prefill:
|
||||
raise RuntimeError("Metadata has neither decode nor prefill metadata")
|
||||
|
||||
num_decode = (
|
||||
metadata.num_decode_tokens
|
||||
if (has_decode and has_prefill)
|
||||
else total_q
|
||||
if has_decode
|
||||
else 0
|
||||
)
|
||||
num_prefill = total_q - num_decode
|
||||
|
||||
# Some backends requires fp8 queries when using fp8 KV cache.
|
||||
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
|
||||
quantize_query = is_fp8_kvcache and getattr(
|
||||
impl, "supports_quant_query_input", False
|
||||
)
|
||||
|
||||
# quantize_query forces concat format
|
||||
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
|
||||
|
||||
# Create decode query tensors
|
||||
if has_decode:
|
||||
decode_inputs, _ = _create_input_tensors(
|
||||
num_decode, mla_dims, query_fmt, device, torch.bfloat16
|
||||
)
|
||||
# Cast decode query to fp8 if the backend supports it
|
||||
if quantize_query:
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if isinstance(decode_inputs, tuple):
|
||||
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
|
||||
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
|
||||
|
||||
# Create prefill input tensors
|
||||
if has_prefill:
|
||||
_, prefill_inputs = _create_input_tensors(
|
||||
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
|
||||
)
|
||||
|
||||
# Build forward function
|
||||
def forward_fn():
|
||||
results = []
|
||||
if has_decode:
|
||||
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
|
||||
if has_prefill:
|
||||
results.append(
|
||||
impl.forward_mha(
|
||||
prefill_inputs["q"],
|
||||
prefill_inputs["k_c_normed"],
|
||||
prefill_inputs["k_pe"],
|
||||
kv_cache,
|
||||
metadata,
|
||||
prefill_inputs["k_scale"],
|
||||
prefill_inputs["output"],
|
||||
)
|
||||
)
|
||||
return results[0] if len(results) == 1 else tuple(results)
|
||||
|
||||
# Warmup
|
||||
for _ in range(config.warmup_iters):
|
||||
forward_fn()
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Optionally capture a CUDA graph after warmup.
|
||||
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||
# kernel time.
|
||||
if config.use_cuda_graphs:
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
forward_fn()
|
||||
benchmark_fn = graph.replay
|
||||
else:
|
||||
benchmark_fn = forward_fn
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
@@ -710,7 +864,7 @@ def _run_single_benchmark(
|
||||
|
||||
start.record()
|
||||
for _ in range(config.num_layers):
|
||||
forward_fn()
|
||||
benchmark_fn()
|
||||
end.record()
|
||||
|
||||
torch.accelerator.synchronize()
|
||||
@@ -732,6 +886,7 @@ def _run_mla_benchmark_batched(
|
||||
backend: str,
|
||||
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
|
||||
index_topk: int = 2048,
|
||||
prefill_backend: str | None = None,
|
||||
) -> list[BenchmarkResult]:
|
||||
"""
|
||||
Unified batched MLA benchmark runner for all backends.
|
||||
@@ -743,11 +898,13 @@ def _run_mla_benchmark_batched(
|
||||
to avoid setup/teardown overhead.
|
||||
|
||||
Args:
|
||||
backend: Backend name
|
||||
backend: Backend name (decode backend used for impl construction)
|
||||
configs_with_params: List of (config, threshold, num_splits) tuples
|
||||
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
|
||||
- num_splits: num_kv_splits (CUTLASS only)
|
||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
|
||||
When set, forces the specified FlashAttention version for prefill.
|
||||
|
||||
Returns:
|
||||
List of BenchmarkResult objects
|
||||
@@ -757,7 +914,7 @@ def _run_mla_benchmark_batched(
|
||||
|
||||
backend_cfg = _get_backend_config(backend)
|
||||
device = torch.device(configs_with_params[0][0].device)
|
||||
torch.cuda.set_device(device)
|
||||
torch.accelerator.set_device_index(device)
|
||||
|
||||
# Determine block size
|
||||
config_block_size = configs_with_params[0][0].block_size
|
||||
@@ -774,26 +931,91 @@ def _run_mla_benchmark_batched(
|
||||
# Determine if this is a sparse backend
|
||||
is_sparse = backend_cfg.get("is_sparse", False)
|
||||
|
||||
# Extract kv_cache_dtype from the first config
|
||||
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
|
||||
|
||||
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
|
||||
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
|
||||
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
|
||||
kv_cache_dtype = "fp8_ds_mla"
|
||||
|
||||
# Compute max total_q across all configs so the metadata builder buffer
|
||||
# and scheduler config are large enough for all batch specs.
|
||||
max_total_q = max(
|
||||
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
|
||||
for cfg, *_ in configs_with_params
|
||||
)
|
||||
|
||||
# Create and set vLLM config for MLA (reused across all benchmarks)
|
||||
vllm_config = create_minimal_vllm_config(
|
||||
model_name="deepseek-v3", # Used only for model path
|
||||
block_size=block_size,
|
||||
max_num_batched_tokens=max_total_q,
|
||||
mla_dims=mla_dims, # Use custom dims from config or default
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
prefill_backend=prefill_backend,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
|
||||
results = []
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Clear cached prefill backend detection functions so they re-evaluate
|
||||
# with the current VllmConfig. These are @functools.cache decorated and
|
||||
# would otherwise return stale results from a previous backend's config.
|
||||
from vllm.model_executor.layers.attention.mla_attention import (
|
||||
use_cudnn_prefill,
|
||||
use_flashinfer_prefill,
|
||||
use_trtllm_ragged_deepseek_prefill,
|
||||
)
|
||||
|
||||
use_flashinfer_prefill.cache_clear()
|
||||
use_cudnn_prefill.cache_clear()
|
||||
use_trtllm_ragged_deepseek_prefill.cache_clear()
|
||||
|
||||
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
|
||||
impl, layer, builder_instance, indexer = _create_backend_impl(
|
||||
backend_cfg,
|
||||
mla_dims,
|
||||
vllm_config,
|
||||
device,
|
||||
max_num_tokens=max_total_q,
|
||||
index_topk=index_topk if is_sparse else None,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
|
||||
# Verify the actual prefill backend matches what was requested
|
||||
if prefill_backend is not None:
|
||||
prefill_cfg = get_prefill_backend_config(prefill_backend)
|
||||
fa_version = prefill_cfg["flash_attn_version"]
|
||||
|
||||
if fa_version is not None:
|
||||
# FA backend: verify the impl's FA version
|
||||
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
|
||||
if actual_fa_version != fa_version:
|
||||
raise RuntimeError(
|
||||
f"Prefill backend '{prefill_backend}' requested FA "
|
||||
f"version {fa_version}, but the impl is using FA "
|
||||
f"version {actual_fa_version}. Check "
|
||||
f"vllm/v1/attention/backends/fa_utils.py."
|
||||
)
|
||||
else:
|
||||
# Non-FA backend: verify the builder picked the right path
|
||||
expected_flags = {
|
||||
"flashinfer": "_use_fi_prefill",
|
||||
"cudnn": "_use_cudnn_prefill",
|
||||
"trtllm": "_use_trtllm_ragged_prefill",
|
||||
}
|
||||
flag_name = expected_flags.get(prefill_backend)
|
||||
if flag_name and not getattr(builder_instance, flag_name, False):
|
||||
raise RuntimeError(
|
||||
f"Prefill backend '{prefill_backend}' was requested "
|
||||
f"but the metadata builder did not enable it. This "
|
||||
f"usually means a dependency is missing (e.g., "
|
||||
f"flashinfer not installed) or the platform doesn't "
|
||||
f"support it."
|
||||
)
|
||||
|
||||
# Run each benchmark with the shared impl
|
||||
for config, threshold, num_splits in configs_with_params:
|
||||
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
|
||||
@@ -818,6 +1040,7 @@ def _run_mla_benchmark_batched(
|
||||
mla_dims,
|
||||
device,
|
||||
indexer=indexer,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
@@ -844,6 +1067,7 @@ def run_mla_benchmark(
|
||||
reorder_batch_threshold: int | None = None,
|
||||
num_kv_splits: int | None = None,
|
||||
index_topk: int = 2048,
|
||||
prefill_backend: str | None = None,
|
||||
) -> BenchmarkResult | list[BenchmarkResult]:
|
||||
"""
|
||||
Unified MLA benchmark runner for all backends.
|
||||
@@ -861,6 +1085,8 @@ def run_mla_benchmark(
|
||||
(single config mode only)
|
||||
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
|
||||
index_topk: Topk value for sparse MLA backends (default 2048)
|
||||
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
|
||||
When set, forces the specified FlashAttention version for prefill.
|
||||
|
||||
Returns:
|
||||
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
|
||||
@@ -884,7 +1110,9 @@ def run_mla_benchmark(
|
||||
return_single = True
|
||||
|
||||
# Use unified batched execution
|
||||
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
|
||||
results = _run_mla_benchmark_batched(
|
||||
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
|
||||
)
|
||||
|
||||
# Return single result or list based on input
|
||||
return results[0] if return_single else results
|
||||
|
||||
@@ -140,7 +140,7 @@ def _create_vllm_config(
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=config.block_size,
|
||||
cache_dtype="auto",
|
||||
cache_dtype=config.kv_cache_dtype,
|
||||
)
|
||||
cache_config.num_gpu_blocks = max_num_blocks
|
||||
cache_config.num_cpu_blocks = 0
|
||||
@@ -215,7 +215,7 @@ def _create_backend_impl(
|
||||
num_kv_heads=config.num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
kv_cache_dtype="auto",
|
||||
kv_cache_dtype=config.kv_cache_dtype,
|
||||
)
|
||||
|
||||
kv_cache_spec = FullAttentionSpec(
|
||||
@@ -288,12 +288,22 @@ def _create_input_tensors(
|
||||
total_q: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
quantize_query: bool = False,
|
||||
) -> tuple:
|
||||
"""Create Q, K, V input tensors for all layers."""
|
||||
"""Create Q, K, V input tensors for all layers.
|
||||
|
||||
When quantize_query is True, queries are cast to fp8 to match backends
|
||||
that require query/key/value dtype consistency.
|
||||
"""
|
||||
q_dtype = dtype
|
||||
if quantize_query:
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
q_dtype = current_platform.fp8_dtype()
|
||||
q_list = [
|
||||
torch.randn(
|
||||
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
|
||||
)
|
||||
).to(q_dtype)
|
||||
for _ in range(config.num_layers)
|
||||
]
|
||||
k_list = [
|
||||
@@ -344,10 +354,17 @@ def _create_kv_cache(
|
||||
# Compute inverse permutation to get back to logical view
|
||||
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
|
||||
|
||||
# Use fp8 dtype for cache when requested.
|
||||
cache_dtype = dtype
|
||||
if config.kv_cache_dtype == "fp8":
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
cache_dtype = current_platform.fp8_dtype()
|
||||
|
||||
cache_list = []
|
||||
for _ in range(config.num_layers):
|
||||
# Allocate in physical layout order (contiguous in memory)
|
||||
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
|
||||
cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
|
||||
# Permute to logical view
|
||||
cache = cache.permute(*inv_order)
|
||||
cache_list.append(cache)
|
||||
@@ -392,6 +409,37 @@ def _run_single_benchmark(
|
||||
)
|
||||
torch.accelerator.synchronize()
|
||||
|
||||
# Optionally capture a CUDA graph after warmup.
|
||||
# Graph replay eliminates CPU launch overhead so timings reflect pure
|
||||
# kernel time.
|
||||
if config.use_cuda_graphs:
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
benchmark_fn = graph.replay
|
||||
else:
|
||||
|
||||
def benchmark_fn():
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(config.repeats):
|
||||
@@ -399,16 +447,7 @@ def _run_single_benchmark(
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start.record()
|
||||
for i in range(config.num_layers):
|
||||
impl.forward(
|
||||
layer,
|
||||
q_list[i],
|
||||
k_list[i],
|
||||
v_list[i],
|
||||
cache_list[i],
|
||||
attn_metadata,
|
||||
output=out,
|
||||
)
|
||||
benchmark_fn()
|
||||
end.record()
|
||||
|
||||
torch.accelerator.synchronize()
|
||||
@@ -418,8 +457,8 @@ def _run_single_benchmark(
|
||||
mem_stats = {}
|
||||
if config.profile_memory:
|
||||
mem_stats = {
|
||||
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
|
||||
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
|
||||
"allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
|
||||
"reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
|
||||
}
|
||||
|
||||
return times, mem_stats
|
||||
@@ -443,7 +482,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
BenchmarkResult with timing and memory statistics
|
||||
"""
|
||||
device = torch.device(config.device)
|
||||
torch.cuda.set_device(device)
|
||||
torch.accelerator.set_device_index(device)
|
||||
|
||||
backend_cfg = _get_backend_config(config.backend)
|
||||
|
||||
@@ -502,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
|
||||
common_attn_metadata=common_metadata,
|
||||
)
|
||||
|
||||
# Only quantize queries when the impl supports it
|
||||
quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
|
||||
impl, "supports_quant_query_input", False
|
||||
)
|
||||
q_list, k_list, v_list = _create_input_tensors(
|
||||
config, total_q, device, dtype
|
||||
config, total_q, device, dtype, quantize_query=quantize_query
|
||||
)
|
||||
|
||||
cache_list = _create_kv_cache(
|
||||
|
||||
@@ -40,9 +40,9 @@ LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
|
||||
details.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@@ -124,7 +124,7 @@ def main(args):
|
||||
|
||||
# Create the LLM engine
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||
|
||||
print("------warm up------")
|
||||
|
||||
@@ -32,6 +32,7 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@@ -196,7 +197,7 @@ def main(args):
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
"""Benchmark offline prioritization."""
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from dataclasses import fields
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
@@ -79,7 +79,7 @@ def run_vllm(
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
||||
|
||||
@@ -95,13 +95,16 @@ def create_logits(
|
||||
def measure_memory() -> tuple[int, int]:
|
||||
"""Return (allocated, reserved) memory in bytes."""
|
||||
torch.accelerator.synchronize()
|
||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
||||
return (
|
||||
torch.accelerator.memory_allocated(),
|
||||
torch.accelerator.max_memory_allocated(),
|
||||
)
|
||||
|
||||
|
||||
def reset_memory_stats():
|
||||
"""Reset peak memory statistics."""
|
||||
reset_buffer_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.accelerator.reset_peak_memory_stats()
|
||||
torch.accelerator.empty_cache()
|
||||
gc.collect()
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ def bench_run(
|
||||
per_out_ch: bool,
|
||||
mkn: tuple[int, int, int],
|
||||
):
|
||||
init_workspace_manager(torch.cuda.current_device())
|
||||
init_workspace_manager(torch.accelerator.current_device_index())
|
||||
(m, k, n) = mkn
|
||||
|
||||
dtype = torch.half
|
||||
|
||||
@@ -495,7 +495,7 @@ def main():
|
||||
|
||||
# Set device
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
torch.accelerator.set_device_index(device)
|
||||
|
||||
# Get CPU process group
|
||||
cpu_group = dist.new_group(backend="gloo")
|
||||
|
||||
@@ -392,7 +392,7 @@ def benchmark_operation(
|
||||
num_op_per_cudagraph = 10
|
||||
|
||||
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
|
||||
device = torch.device(f"cuda:{torch.cuda.current_device()}")
|
||||
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
|
||||
with graph_capture(device=device), torch.cuda.graph(graph):
|
||||
for _ in range(num_op_per_cudagraph):
|
||||
operation_func(*args, **kwargs)
|
||||
@@ -984,7 +984,7 @@ def main():
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
torch.accelerator.set_device_index(device)
|
||||
torch.set_default_device(device)
|
||||
|
||||
init_distributed_environment()
|
||||
|
||||
@@ -50,7 +50,7 @@ def bench_run(
|
||||
per_out_ch: bool,
|
||||
mkn: tuple[int, int, int],
|
||||
):
|
||||
init_workspace_manager(torch.cuda.current_device())
|
||||
init_workspace_manager(torch.accelerator.current_device_index())
|
||||
label = "Quant Matmul"
|
||||
|
||||
sub_label = (
|
||||
|
||||
@@ -626,7 +626,11 @@ class BenchmarkWorker:
|
||||
if visible_device != f"{self.device_id}":
|
||||
need_device_guard = True
|
||||
|
||||
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
|
||||
with (
|
||||
torch.accelerator.device_index(self.device_id)
|
||||
if need_device_guard
|
||||
else nullcontext()
|
||||
):
|
||||
for idx, config in enumerate(tqdm(search_space)):
|
||||
try:
|
||||
kernel_time = benchmark_config(
|
||||
@@ -746,17 +750,20 @@ def get_weight_block_size_safety(config, default_value=None):
|
||||
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
architectures = getattr(config, "architectures", None) or [type(config).__name__]
|
||||
architecture = architectures[0]
|
||||
|
||||
if architecture == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "JambaForCausalLM":
|
||||
elif architecture == "JambaForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
elif architecture in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
@@ -770,7 +777,7 @@ def get_model_params(config):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in (
|
||||
elif architecture in (
|
||||
"Qwen2MoeForCausalLM",
|
||||
"Qwen3MoeForCausalLM",
|
||||
"Qwen3NextForCausalLM",
|
||||
@@ -779,23 +786,27 @@ def get_model_params(config):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
|
||||
elif architecture in (
|
||||
"Qwen3VLMoeForConditionalGeneration",
|
||||
"Qwen3_5MoeForConditionalGeneration",
|
||||
"Qwen3_5MoeTextConfig",
|
||||
):
|
||||
text_config = config.get_text_config()
|
||||
E = text_config.num_experts
|
||||
topk = text_config.num_experts_per_tok
|
||||
intermediate_size = text_config.moe_intermediate_size
|
||||
hidden_size = text_config.hidden_size
|
||||
elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
|
||||
elif architecture == "HunYuanMoEV1ForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
|
||||
elif architecture == "Qwen3OmniMoeForConditionalGeneration":
|
||||
E = config.thinker_config.text_config.num_experts
|
||||
topk = config.thinker_config.text_config.num_experts_per_tok
|
||||
intermediate_size = config.thinker_config.text_config.moe_intermediate_size
|
||||
hidden_size = config.thinker_config.text_config.hidden_size
|
||||
elif config.architectures[0] == "PixtralForConditionalGeneration":
|
||||
elif architecture == "PixtralForConditionalGeneration":
|
||||
# Pixtral can contain different LLM architectures,
|
||||
# recurse to get their parameters
|
||||
return get_model_params(config.get_text_config())
|
||||
@@ -810,6 +821,23 @@ def get_model_params(config):
|
||||
return E, topk, intermediate_size, hidden_size
|
||||
|
||||
|
||||
def resolve_dtype(config) -> torch.dtype:
|
||||
if current_platform.is_rocm():
|
||||
return torch.float16
|
||||
|
||||
dtype = getattr(config, "dtype", None)
|
||||
if dtype is not None:
|
||||
return dtype
|
||||
|
||||
if hasattr(config, "get_text_config"):
|
||||
text_config = config.get_text_config()
|
||||
dtype = getattr(text_config, "dtype", None)
|
||||
if dtype is not None:
|
||||
return dtype
|
||||
|
||||
return torch.bfloat16
|
||||
|
||||
|
||||
def get_quantization_group_size(config) -> int | None:
|
||||
"""Extract the quantization group size from the HF model config.
|
||||
|
||||
@@ -857,7 +885,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
dtype = resolve_dtype(config)
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_int4_w4a16 = args.dtype == "int4_w4a16"
|
||||
|
||||
134
benchmarks/kernels/benchmark_router_gemm.py
Normal file
134
benchmarks/kernels/benchmark_router_gemm.py
Normal file
@@ -0,0 +1,134 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
# Dimensions supported by the DSV3 specialized kernel
|
||||
DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
|
||||
DSV3_SUPPORTED_HIDDEN_SIZES = [7168]
|
||||
|
||||
# Dimensions supported by the gpt-oss specialized kernel
|
||||
GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
|
||||
GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]
|
||||
|
||||
|
||||
def get_batch_size_range(max_batch_size):
|
||||
return [2**x for x in range(14) if 2**x <= max_batch_size]
|
||||
|
||||
|
||||
def get_model_params(config):
|
||||
if config.architectures[0] in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
):
|
||||
num_experts = config.n_routed_experts
|
||||
hidden_size = config.hidden_size
|
||||
elif config.architectures[0] in ("GptOssForCausalLM",):
|
||||
num_experts = config.num_local_experts
|
||||
hidden_size = config.hidden_size
|
||||
else:
|
||||
raise ValueError(f"Unsupported architecture: {config.architectures}")
|
||||
return num_experts, hidden_size
|
||||
|
||||
|
||||
def get_benchmark(model, max_batch_size, trust_remote_code):
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=get_batch_size_range(max_batch_size),
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=[
|
||||
"torch",
|
||||
"vllm",
|
||||
],
|
||||
line_names=["PyTorch", "vLLM"],
|
||||
styles=([("blue", "-"), ("red", "-")]),
|
||||
ylabel="TFLOPs",
|
||||
plot_name=f"{model} router gemm throughput",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider):
|
||||
config = get_config(model=model, trust_remote_code=trust_remote_code)
|
||||
num_experts, hidden_size = get_model_params(config)
|
||||
|
||||
mat_a = torch.randn(
|
||||
(batch_size, hidden_size), dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
mat_b = torch.randn(
|
||||
(num_experts, hidden_size), dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
bias = torch.randn(
|
||||
num_experts, dtype=torch.bfloat16, device="cuda"
|
||||
).contiguous()
|
||||
|
||||
is_hopper_or_blackwell = current_platform.is_device_capability(
|
||||
90
|
||||
) or current_platform.is_device_capability_family(100)
|
||||
allow_dsv3_router_gemm = (
|
||||
is_hopper_or_blackwell
|
||||
and num_experts in DSV3_SUPPORTED_NUM_EXPERTS
|
||||
and hidden_size in DSV3_SUPPORTED_HIDDEN_SIZES
|
||||
)
|
||||
allow_gpt_oss_router_gemm = (
|
||||
is_hopper_or_blackwell
|
||||
and num_experts in GPT_OSS_SUPPORTED_NUM_EXPERTS
|
||||
and hidden_size in GPT_OSS_SUPPORTED_HIDDEN_SIZES
|
||||
)
|
||||
|
||||
has_bias = False
|
||||
if allow_gpt_oss_router_gemm:
|
||||
has_bias = True
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch":
|
||||
|
||||
def runner():
|
||||
if has_bias:
|
||||
F.linear(mat_a, mat_b, bias)
|
||||
else:
|
||||
F.linear(mat_a, mat_b)
|
||||
elif provider == "vllm":
|
||||
|
||||
def runner():
|
||||
if allow_dsv3_router_gemm:
|
||||
ops.dsv3_router_gemm(mat_a, mat_b, torch.bfloat16)
|
||||
elif allow_gpt_oss_router_gemm:
|
||||
ops.gpt_oss_router_gemm(mat_a, mat_b, bias)
|
||||
else:
|
||||
raise ValueError("Unsupported router gemm")
|
||||
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
runner, quantiles=quantiles
|
||||
)
|
||||
|
||||
def tflops(t_ms):
|
||||
flops = 2 * batch_size * hidden_size * num_experts
|
||||
return flops / (t_ms * 1e-3) / 1e12
|
||||
|
||||
return tflops(ms), tflops(max_ms), tflops(min_ms)
|
||||
|
||||
return benchmark
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--model", type=str, default="openai/gpt-oss-20b")
|
||||
parser.add_argument("--max-batch-size", default=16, type=int)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get the benchmark function
|
||||
benchmark = get_benchmark(args.model, args.max_batch_size, args.trust_remote_code)
|
||||
# Run performance benchmark
|
||||
benchmark.run(print_data=True)
|
||||
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
|
||||
weight_shapes = args_dict["weight_shapes"]
|
||||
args = args_dict["args"]
|
||||
|
||||
torch.cuda.set_device(gpu_id)
|
||||
torch.accelerator.set_device_index(gpu_id)
|
||||
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
||||
|
||||
block_n = args.block_n
|
||||
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
|
||||
|
||||
def main(args):
|
||||
print(args)
|
||||
num_gpus = torch.cuda.device_count()
|
||||
num_gpus = torch.accelerator.device_count()
|
||||
if num_gpus == 0:
|
||||
raise RuntimeError("No GPU available for tuning")
|
||||
print(f"Found {num_gpus} GPUs for parallel tuning")
|
||||
|
||||
@@ -27,7 +27,7 @@ def get_attn_isa(
|
||||
else:
|
||||
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
|
||||
return "neon"
|
||||
elif torch._C._cpu._is_amx_tile_supported():
|
||||
elif torch.cpu._is_amx_tile_supported():
|
||||
return "amx"
|
||||
else:
|
||||
return "vec"
|
||||
|
||||
@@ -24,7 +24,7 @@ except (ImportError, AttributeError) as e:
|
||||
sys.exit(1)
|
||||
|
||||
# ISA selection following test_cpu_fused_moe.py pattern
|
||||
ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
||||
ISA_CHOICES = ["amx", "vec"] if torch.cpu._is_amx_tile_supported() else ["vec"]
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
|
||||
@@ -79,7 +79,8 @@ else()
|
||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
|
||||
find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_ARM_BF16)
|
||||
@@ -101,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
|
||||
"-mavx512f"
|
||||
"-mavx512vl"
|
||||
"-mavx512bw"
|
||||
"-mavx512dq"
|
||||
"-mavx512bf16"
|
||||
"-mavx512vnni"
|
||||
"-mavx512dq")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
|
||||
${CXX_COMPILE_FLAGS_AVX512}
|
||||
"-mamx-bf16"
|
||||
"-mamx-tile")
|
||||
"-mamx-tile"
|
||||
"-mavx512bf16"
|
||||
"-mavx512vnni")
|
||||
list(APPEND CXX_COMPILE_FLAGS_AVX2
|
||||
"-mavx2")
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
@@ -142,11 +145,19 @@ elseif (S390_FOUND)
|
||||
"-march=native"
|
||||
"-mtune=native")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||
if(RVV_FOUND)
|
||||
message(FAIL_ERROR "Can't support rvv now.")
|
||||
message(STATUS "RISC-V detected")
|
||||
if(RVV_BF16_FOUND)
|
||||
message(STATUS "BF16 extension detected")
|
||||
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
|
||||
add_compile_definitions(RISCV_BF16_SUPPORT)
|
||||
elseif (RVV_FP16_FOUND)
|
||||
message(WARNING "BF16 functionality is not available")
|
||||
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
|
||||
else()
|
||||
message(STATUS "compile riscv with scalar")
|
||||
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
|
||||
endif()
|
||||
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
|
||||
else()
|
||||
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
|
||||
endif()
|
||||
@@ -305,7 +316,8 @@ endif()
|
||||
|
||||
# TODO: Refactor this
|
||||
if (ENABLE_X86_ISA)
|
||||
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
|
||||
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
|
||||
else()
|
||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||
@@ -357,13 +369,15 @@ if(USE_ONEDNN)
|
||||
endif()
|
||||
|
||||
if (ENABLE_X86_ISA)
|
||||
set(VLLM_EXT_SRC_AVX512
|
||||
set(VLLM_EXT_SRC_SGL
|
||||
"csrc/cpu/sgl-kernels/gemm.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_int8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
|
||||
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
|
||||
|
||||
set(VLLM_EXT_SRC_AVX512
|
||||
"csrc/cpu/shm.cpp"
|
||||
"csrc/cpu/cpu_wna16.cpp"
|
||||
"csrc/cpu/cpu_fused_moe.cpp"
|
||||
@@ -389,31 +403,48 @@ if (ENABLE_X86_ISA)
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
|
||||
|
||||
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
|
||||
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
|
||||
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
|
||||
|
||||
set(_C_LIBS numa dnnl_ext)
|
||||
set(_C_AVX512_LIBS numa dnnl_ext)
|
||||
set(_C_AVX2_LIBS numa)
|
||||
|
||||
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
|
||||
define_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
|
||||
LIBRARIES ${_C_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
# For AMX kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||
|
||||
# AVX512F
|
||||
define_extension_target(
|
||||
_C_AVX512
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX512}
|
||||
LIBRARIES ${LIBS}
|
||||
LIBRARIES ${_C_AVX512_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
# For SGL kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
|
||||
# For AMX kernels
|
||||
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
|
||||
|
||||
# AVX2
|
||||
define_extension_target(
|
||||
_C_AVX2
|
||||
DESTINATION vllm
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC_AVX2}
|
||||
LIBRARIES ${LIBS}
|
||||
LIBRARIES ${_C_AVX2_LIBS}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
|
||||
@@ -39,7 +39,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
|
||||
GIT_TAG 29210221863736a08f71a866459e368ad1ac4a95
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@@ -919,8 +919,8 @@ __global__ void gather_and_maybe_dequant_cache(
|
||||
// SCALAR_T is the data type of the destination tensor.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
|
||||
thread_block_size> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
@@ -931,6 +931,12 @@ __global__ void gather_and_maybe_dequant_cache(
|
||||
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
||||
seq_starts_ptr);
|
||||
|
||||
#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
|
||||
|
||||
#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
@@ -960,9 +966,10 @@ void gather_and_maybe_dequant_cache(
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
TORCH_CHECK(head_dim == 576,
|
||||
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
|
||||
"for better performance")
|
||||
TORCH_CHECK(
|
||||
head_dim == 320 || head_dim == 576,
|
||||
"gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
|
||||
"for better performance")
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
@@ -987,7 +994,13 @@ void gather_and_maybe_dequant_cache(
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
|
||||
if (head_dim == 576) {
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
|
||||
CALL_GATHER_CACHE_576);
|
||||
} else {
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
|
||||
CALL_GATHER_CACHE_320);
|
||||
}
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
@@ -13,6 +13,9 @@
|
||||
#elif defined(__aarch64__)
|
||||
// arm implementation
|
||||
#include "cpu_types_arm.hpp"
|
||||
#elif defined(__riscv_v)
|
||||
// riscv implementation
|
||||
#include "cpu_types_riscv.hpp"
|
||||
#else
|
||||
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
|
||||
#include "cpu_types_scalar.hpp"
|
||||
|
||||
832
csrc/cpu/cpu_types_riscv.hpp
Normal file
832
csrc/cpu/cpu_types_riscv.hpp
Normal file
@@ -0,0 +1,832 @@
|
||||
#ifndef CPU_TYPES_RISCV_HPP
|
||||
#define CPU_TYPES_RISCV_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <riscv_vector.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
// ============================================================================
|
||||
// Vector Register Type Definitions (VLEN=128 bits)
|
||||
// ============================================================================
|
||||
|
||||
typedef vfloat16m1_t fixed_vfloat16m1_t
|
||||
__attribute__((riscv_rvv_vector_bits(128)));
|
||||
typedef vfloat16m2_t fixed_vfloat16m2_t
|
||||
__attribute__((riscv_rvv_vector_bits(256)));
|
||||
|
||||
typedef vfloat32m1_t fixed_vfloat32m1_t
|
||||
__attribute__((riscv_rvv_vector_bits(128)));
|
||||
typedef vfloat32m2_t fixed_vfloat32m2_t
|
||||
__attribute__((riscv_rvv_vector_bits(256)));
|
||||
typedef vfloat32m4_t fixed_vfloat32m4_t
|
||||
__attribute__((riscv_rvv_vector_bits(512)));
|
||||
typedef vfloat32m8_t fixed_vfloat32m8_t
|
||||
__attribute__((riscv_rvv_vector_bits(1024)));
|
||||
|
||||
typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
|
||||
typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
|
||||
|
||||
typedef vuint16m1_t fixed_vuint16m1_t
|
||||
__attribute__((riscv_rvv_vector_bits(128)));
|
||||
typedef vuint16m2_t fixed_vuint16m2_t
|
||||
__attribute__((riscv_rvv_vector_bits(256)));
|
||||
typedef vuint16m4_t fixed_vuint16m4_t
|
||||
__attribute__((riscv_rvv_vector_bits(512)));
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
typedef vbfloat16m1_t fixed_vbfloat16m1_t
|
||||
__attribute__((riscv_rvv_vector_bits(128)));
|
||||
typedef vbfloat16m2_t fixed_vbfloat16m2_t
|
||||
__attribute__((riscv_rvv_vector_bits(256)));
|
||||
typedef vbfloat16m4_t fixed_vbfloat16m4_t
|
||||
__attribute__((riscv_rvv_vector_bits(512)));
|
||||
#endif
|
||||
|
||||
namespace vec_op {
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||
#else
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||
constexpr void unroll_loop(F&& f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
// ============================================================================
|
||||
// FP16 Implementation
|
||||
// ============================================================================
|
||||
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
fixed_vfloat16m1_t reg;
|
||||
|
||||
explicit FP16Vec8(const void* ptr)
|
||||
: reg(__riscv_vle16_v_f16m1(static_cast<const _Float16*>(ptr),
|
||||
VEC_ELEM_NUM)) {};
|
||||
|
||||
explicit FP16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const {
|
||||
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, elem_num);
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(_Float16);
|
||||
__riscv_vsse16_v_f16m1(static_cast<_Float16*>(ptr), byte_stride, reg,
|
||||
VEC_ELEM_NUM);
|
||||
}
|
||||
};
|
||||
|
||||
struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
fixed_vfloat16m2_t reg;
|
||||
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg(__riscv_vle16_v_f16m2(static_cast<const _Float16*>(ptr),
|
||||
VEC_ELEM_NUM)) {};
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16& vec);
|
||||
|
||||
void save(void* ptr) const {
|
||||
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, elem_num);
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(_Float16);
|
||||
__riscv_vsse16_v_f16m2(static_cast<_Float16*>(ptr), byte_stride, reg,
|
||||
VEC_ELEM_NUM);
|
||||
}
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// BF16 Implementation
|
||||
// ============================================================================
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
|
||||
FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t v) {
|
||||
return __riscv_vreinterpret_v_bf16m1_u16m1(v);
|
||||
}
|
||||
FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t v) {
|
||||
return __riscv_vreinterpret_v_bf16m2_u16m2(v);
|
||||
}
|
||||
FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t v) {
|
||||
return __riscv_vreinterpret_v_bf16m4_u16m4(v);
|
||||
}
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
fixed_vbfloat16m1_t reg;
|
||||
|
||||
explicit BF16Vec8(const void* ptr)
|
||||
: reg(__riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vle16_v_u16m1(
|
||||
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
|
||||
|
||||
explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {};
|
||||
explicit BF16Vec8(const FP32Vec8&);
|
||||
|
||||
void save(void* ptr) const {
|
||||
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
VEC_ELEM_NUM);
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
elem_num);
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
__riscv_vsse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), byte_stride,
|
||||
bf16_to_u16(reg), VEC_ELEM_NUM);
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
fixed_vbfloat16m2_t reg;
|
||||
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(__riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vle16_v_u16m2(
|
||||
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
|
||||
|
||||
explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {};
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const {
|
||||
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
VEC_ELEM_NUM);
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
elem_num);
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
__riscv_vsse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), byte_stride,
|
||||
bf16_to_u16(reg), VEC_ELEM_NUM);
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
fixed_vbfloat16m4_t reg;
|
||||
|
||||
explicit BF16Vec32(const void* ptr)
|
||||
: reg(__riscv_vreinterpret_v_u16m4_bf16m4(__riscv_vle16_v_u16m4(
|
||||
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
|
||||
|
||||
explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec32(const BF16Vec8& v) {
|
||||
fixed_vuint16m1_t u16_val = bf16_to_u16(v.reg);
|
||||
fixed_vuint16m4_t u16_combined =
|
||||
__riscv_vcreate_v_u16m1_u16m4(u16_val, u16_val, u16_val, u16_val);
|
||||
reg = __riscv_vreinterpret_v_u16m4_bf16m4(u16_combined);
|
||||
};
|
||||
|
||||
void save(void* ptr) const {
|
||||
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
VEC_ELEM_NUM);
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
|
||||
elem_num);
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
__riscv_vsse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), byte_stride,
|
||||
bf16_to_u16(reg), VEC_ELEM_NUM);
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
// ============================================================================
|
||||
// BF16 Fallback Implementation (FP32 Simulation)
|
||||
// ============================================================================
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
fixed_vfloat32m2_t reg_fp32;
|
||||
explicit BF16Vec8(const void* ptr) {
|
||||
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
|
||||
float tmp[8];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
|
||||
std::memcpy(&tmp[i], &v, 4);
|
||||
}
|
||||
reg_fp32 = __riscv_vle32_v_f32m2(tmp, 8);
|
||||
}
|
||||
explicit BF16Vec8(const FP32Vec8&);
|
||||
void save(void* ptr) const {
|
||||
float tmp[8];
|
||||
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
float tmp[8];
|
||||
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < elem_num; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
float tmp[8];
|
||||
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
|
||||
uint8_t* u8 = static_cast<uint8_t*>(ptr);
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
uint16_t val = static_cast<uint16_t>(v >> 16);
|
||||
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
fixed_vfloat32m4_t reg_fp32;
|
||||
explicit BF16Vec16(const void* ptr) {
|
||||
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
|
||||
float tmp[16];
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
|
||||
std::memcpy(&tmp[i], &v, 4);
|
||||
}
|
||||
reg_fp32 = __riscv_vle32_v_f32m4(tmp, 16);
|
||||
}
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
void save(void* ptr) const {
|
||||
float tmp[16];
|
||||
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
void save(void* ptr, int elem_num) const {
|
||||
float tmp[16];
|
||||
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < elem_num; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
float tmp[16];
|
||||
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
|
||||
uint8_t* u8 = static_cast<uint8_t*>(ptr);
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
uint16_t val = static_cast<uint16_t>(v >> 16);
|
||||
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
fixed_vfloat32m8_t reg_fp32;
|
||||
|
||||
explicit BF16Vec32(const void* ptr) {
|
||||
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
|
||||
float tmp[32];
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
|
||||
std::memcpy(&tmp[i], &v, 4);
|
||||
}
|
||||
reg_fp32 = __riscv_vle32_v_f32m8(tmp, 32);
|
||||
}
|
||||
|
||||
explicit BF16Vec32(const BF16Vec8& v) {
|
||||
float tmp_small[8];
|
||||
__riscv_vse32_v_f32m2(tmp_small, v.reg_fp32, 8);
|
||||
float tmp_large[32];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
std::memcpy(tmp_large + (i * 8), tmp_small, 8 * sizeof(float));
|
||||
}
|
||||
reg_fp32 = __riscv_vle32_v_f32m8(tmp_large, 32);
|
||||
}
|
||||
|
||||
void save(void* ptr) const {
|
||||
float tmp[32];
|
||||
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
|
||||
void save(void* ptr, int elem_num) const {
|
||||
float tmp[32];
|
||||
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
|
||||
uint16_t* u16 = static_cast<uint16_t*>(ptr);
|
||||
for (int i = 0; i < elem_num; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
u16[i] = static_cast<uint16_t>(v >> 16);
|
||||
}
|
||||
}
|
||||
|
||||
void save_strided(void* ptr, ptrdiff_t stride) const {
|
||||
float tmp[32];
|
||||
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
|
||||
uint8_t* u8 = static_cast<uint8_t*>(ptr);
|
||||
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
uint32_t v;
|
||||
std::memcpy(&v, &tmp[i], 4);
|
||||
uint16_t val = static_cast<uint16_t>(v >> 16);
|
||||
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
// ============================================================================
|
||||
// FP32 Implementation
|
||||
// ============================================================================
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
fixed_vfloat32m1_t reg;
|
||||
explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec4() : reg(__riscv_vfmv_v_f_f32m1(0.0f, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec4(const float* ptr)
|
||||
: reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {};
|
||||
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
|
||||
void save(float* ptr) const { __riscv_vse32_v_f32m1(ptr, reg, VEC_ELEM_NUM); }
|
||||
void save(float* ptr, int elem_num) const {
|
||||
__riscv_vse32_v_f32m1(ptr, reg, elem_num);
|
||||
}
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
fixed_vfloat32m2_t reg;
|
||||
|
||||
explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec8(const float* ptr)
|
||||
: reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {};
|
||||
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
|
||||
explicit FP32Vec8(const FP16Vec8& v)
|
||||
: reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec8(fixed_vfloat16m1_t v)
|
||||
: reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
explicit FP32Vec8(fixed_vbfloat16m1_t v)
|
||||
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec8(const BF16Vec8& v)
|
||||
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
|
||||
#else
|
||||
explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {};
|
||||
#endif
|
||||
|
||||
float reduce_sum() const {
|
||||
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
|
||||
scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec8 operator+(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec8 operator-(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec8 operator/(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec8 min(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec8 max(const FP32Vec8& b) const {
|
||||
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec8 abs() const {
|
||||
return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
|
||||
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
|
||||
}
|
||||
FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
|
||||
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
|
||||
}
|
||||
|
||||
FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
|
||||
fixed_vfloat32m2_t temp =
|
||||
__riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
|
||||
return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
|
||||
void save(float* ptr, int elem_num) const {
|
||||
__riscv_vse32_v_f32m2(ptr, reg, elem_num);
|
||||
}
|
||||
void save_strided(float* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(float);
|
||||
__riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
const float inv_ln2 = 1.44269504088896341f;
|
||||
fixed_vfloat32m2_t x_scaled =
|
||||
__riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
|
||||
fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m2_t r =
|
||||
__riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m2_t poly =
|
||||
__riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vint32m2_t biased_exp =
|
||||
__riscv_vadd_vx_i32m2(n_int, 127, VEC_ELEM_NUM);
|
||||
biased_exp = __riscv_vmax_vx_i32m2(biased_exp, 0, VEC_ELEM_NUM);
|
||||
fixed_vint32m2_t exponent_bits =
|
||||
__riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m2_t scale =
|
||||
__riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);
|
||||
|
||||
return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
|
||||
__riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m2_t x2 =
|
||||
__riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
|
||||
FP32Vec8 exp_val = FP32Vec8(x2).exp();
|
||||
fixed_vfloat32m2_t num =
|
||||
__riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m2_t den =
|
||||
__riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
|
||||
return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
|
||||
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
|
||||
fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
|
||||
__riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
|
||||
t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
|
||||
a4, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
|
||||
a3, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
|
||||
a2, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
|
||||
a1, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m2_t exp_val =
|
||||
FP32Vec8(__riscv_vfneg_v_f32m2(
|
||||
__riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
|
||||
VEC_ELEM_NUM))
|
||||
.exp()
|
||||
.reg;
|
||||
fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
|
||||
__riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
|
||||
VEC_ELEM_NUM);
|
||||
|
||||
vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
|
||||
return FP32Vec8(__riscv_vfneg_v_f32m2_m(mask, res, VEC_ELEM_NUM));
|
||||
}
|
||||
};
|
||||
|
||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
fixed_vfloat32m4_t reg;
|
||||
|
||||
explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec16(const float* ptr)
|
||||
: reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {};
|
||||
explicit FP32Vec16(const FP32Vec8& data)
|
||||
: reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {};
|
||||
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
|
||||
explicit FP32Vec16(const FP16Vec16& v);
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
explicit FP32Vec16(fixed_vbfloat16m2_t v)
|
||||
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {};
|
||||
explicit FP32Vec16(const BF16Vec16& v)
|
||||
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {};
|
||||
#else
|
||||
explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {};
|
||||
#endif
|
||||
|
||||
FP32Vec16 operator+(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec16 operator-(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec16 operator*(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec16 operator/(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
|
||||
scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||
}
|
||||
|
||||
float reduce_max() const {
|
||||
fixed_vfloat32m1_t scalar =
|
||||
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
|
||||
scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||
}
|
||||
|
||||
float reduce_min() const {
|
||||
fixed_vfloat32m1_t scalar =
|
||||
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
|
||||
scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
|
||||
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||
}
|
||||
|
||||
template <int group_size>
|
||||
float reduce_sub_sum(int idx) {
|
||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||
const int start = idx * group_size;
|
||||
vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
|
||||
vbool8_t mask = __riscv_vmand_mm_b8(
|
||||
__riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
|
||||
__riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
|
||||
VEC_ELEM_NUM);
|
||||
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
|
||||
scalar =
|
||||
__riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
|
||||
return __riscv_vfmv_f_s_f32m1_f32(scalar);
|
||||
};
|
||||
|
||||
FP32Vec16 max(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec16 min(const FP32Vec16& b) const {
|
||||
return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
|
||||
}
|
||||
FP32Vec16 abs() const {
|
||||
return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
|
||||
return FP32Vec16(__riscv_vfmin_vv_f32m4(
|
||||
max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
|
||||
VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
void save(float* ptr) const { __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM); }
|
||||
void save(float* ptr, int elem_num) const {
|
||||
__riscv_vse32_v_f32m4(ptr, reg, elem_num);
|
||||
}
|
||||
void save_strided(float* ptr, ptrdiff_t stride) const {
|
||||
ptrdiff_t byte_stride = stride * sizeof(float);
|
||||
__riscv_vsse32_v_f32m4(ptr, byte_stride, reg, VEC_ELEM_NUM);
|
||||
}
|
||||
|
||||
FP32Vec16 exp() const {
|
||||
const float inv_ln2 = 1.44269504088896341f;
|
||||
fixed_vfloat32m4_t x_scaled =
|
||||
__riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
|
||||
fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m4_t r =
|
||||
__riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m4_t poly =
|
||||
__riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
|
||||
0.009618129107628f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
|
||||
0.055504108664821f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
|
||||
0.240226506959101f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
|
||||
0.693147180559945f, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
|
||||
1.0f, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vint32m4_t biased_exp = __riscv_vmax_vx_i32m4(
|
||||
__riscv_vadd_vx_i32m4(n_int, 127, VEC_ELEM_NUM), 0, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
|
||||
__riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));
|
||||
|
||||
return FP32Vec16(__riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec16 tanh() const {
|
||||
fixed_vfloat32m4_t x_clamped = __riscv_vfmin_vf_f32m4(
|
||||
__riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
|
||||
FP32Vec16 exp_val =
|
||||
FP32Vec16(__riscv_vfmul_vf_f32m4(x_clamped, 2.0f, VEC_ELEM_NUM)).exp();
|
||||
return FP32Vec16(__riscv_vfdiv_vv_f32m4(
|
||||
__riscv_vfsub_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM),
|
||||
__riscv_vfadd_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM), VEC_ELEM_NUM));
|
||||
}
|
||||
|
||||
FP32Vec16 er() const {
|
||||
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
|
||||
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
|
||||
fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
|
||||
fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
|
||||
__riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
|
||||
1.0f, VEC_ELEM_NUM),
|
||||
1.0f, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
|
||||
a4, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
|
||||
a3, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
|
||||
a2, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
|
||||
a1, VEC_ELEM_NUM);
|
||||
poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);
|
||||
|
||||
fixed_vfloat32m4_t exp_val =
|
||||
FP32Vec16(__riscv_vfneg_v_f32m4(
|
||||
__riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
|
||||
VEC_ELEM_NUM))
|
||||
.exp()
|
||||
.reg;
|
||||
fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
|
||||
__riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
|
||||
VEC_ELEM_NUM);
|
||||
|
||||
vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
|
||||
return FP32Vec16(__riscv_vfneg_v_f32m4_m(mask, res, VEC_ELEM_NUM));
|
||||
}
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Type Traits & Global Helpers
|
||||
// ============================================================================
|
||||
|
||||
template <typename T>
|
||||
struct VecType {
|
||||
using vec_type = void;
|
||||
using vec_t = void;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using vec_t = typename VecType<T>::vec_type;
|
||||
|
||||
template <>
|
||||
struct VecType<float> {
|
||||
using vec_type = FP32Vec8;
|
||||
using vec_t = FP32Vec8;
|
||||
};
|
||||
template <>
|
||||
struct VecType<c10::Half> {
|
||||
using vec_type = FP16Vec8;
|
||||
using vec_t = FP16Vec8;
|
||||
};
|
||||
template <>
|
||||
struct VecType<c10::BFloat16> {
|
||||
using vec_type = BF16Vec8;
|
||||
using vec_t = BF16Vec8;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void storeFP32(float v, T* ptr) {
|
||||
*ptr = v;
|
||||
}
|
||||
template <>
|
||||
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
|
||||
*reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
|
||||
}
|
||||
|
||||
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
|
||||
reg = __riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM);
|
||||
}
|
||||
inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
|
||||
reg = __riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM);
|
||||
}
|
||||
inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
|
||||
reg = __riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM);
|
||||
}
|
||||
inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
|
||||
acc = acc.fma(a, b);
|
||||
}
|
||||
|
||||
#ifdef RISCV_BF16_SUPPORT
|
||||
template <>
|
||||
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||
*ptr = static_cast<__bf16>(v);
|
||||
};
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
|
||||
: reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
|
||||
: reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
|
||||
#else
|
||||
template <>
|
||||
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||
uint32_t val;
|
||||
std::memcpy(&val, &v, 4);
|
||||
*reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
|
||||
}
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
|
||||
#endif
|
||||
|
||||
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }
|
||||
|
||||
} // namespace vec_op
|
||||
|
||||
#ifndef CPU_KERNEL_GUARD_IN
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#endif
|
||||
|
||||
#ifndef CPU_KERNEL_GUARD_OUT
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#endif
|
||||
|
||||
#endif // CPU_TYPES_RISCV_HPP
|
||||
@@ -173,10 +173,13 @@ ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
|
||||
void ScratchPadManager::realloc(size_t new_size) {
|
||||
new_size = round(new_size);
|
||||
if (new_size > size_) {
|
||||
void* new_ptr = std::aligned_alloc(64, new_size);
|
||||
TORCH_CHECK(new_ptr != nullptr,
|
||||
"ScratchPadManager: aligned_alloc failed for size ", new_size);
|
||||
if (ptr_ != nullptr) {
|
||||
std::free(ptr_);
|
||||
}
|
||||
ptr_ = std::aligned_alloc(64, new_size);
|
||||
ptr_ = new_ptr;
|
||||
size_ = new_size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,6 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
|
||||
return val;
|
||||
#else
|
||||
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
||||
return u32x8_t{};
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
||||
|
||||
#ifndef USE_ROCM
|
||||
int flag = 0;
|
||||
CUDA_CHECK(cuDeviceGetAttribute(
|
||||
CUresult rdma_result = cuDeviceGetAttribute(
|
||||
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
||||
device));
|
||||
if (flag) { // support GPUDirect RDMA if possible
|
||||
device);
|
||||
if (rdma_result == CUDA_SUCCESS &&
|
||||
flag) { // support GPUDirect RDMA if possible
|
||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
}
|
||||
int fab_flag = 0;
|
||||
CUDA_CHECK(cuDeviceGetAttribute(
|
||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
|
||||
if (fab_flag) { // support fabric handle if possible
|
||||
CUresult fab_result = cuDeviceGetAttribute(
|
||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
|
||||
if (fab_result == CUDA_SUCCESS &&
|
||||
fab_flag) { // support fabric handle if possible
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
#endif
|
||||
|
||||
9
csrc/libtorch_stable/ops.h
Normal file
9
csrc/libtorch_stable/ops.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
|
||||
torch::stable::Tensor const& perm);
|
||||
#endif
|
||||
@@ -1,10 +1,13 @@
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include "torch_utils.h"
|
||||
|
||||
static constexpr int default_threads = 256;
|
||||
static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
|
||||
|
||||
@@ -64,19 +67,22 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
|
||||
// More efficient version of A[..., perm]
|
||||
// taken from gptq_marlin.cu
|
||||
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
|
||||
auto dev = A.get_device();
|
||||
auto stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
torch::stable::Tensor permute_cols(torch::stable::Tensor const& A,
|
||||
torch::stable::Tensor const& perm) {
|
||||
const int32_t dev = A.get_device_index();
|
||||
const torch::stable::accelerator::DeviceGuard device_guard(dev);
|
||||
const auto stream = get_current_cuda_stream(dev);
|
||||
|
||||
TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
|
||||
"Currently only 16bit types are supported");
|
||||
TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
|
||||
TORCH_CHECK(A.size(-1) % 8 == 0,
|
||||
"A columns must be a multiple of 8 (128bits)");
|
||||
auto A_2d = A.view({-1, A.size(-1)});
|
||||
STD_TORCH_CHECK(
|
||||
A.scalar_type() == torch::headeronly::ScalarType::Half ||
|
||||
A.scalar_type() == torch::headeronly::ScalarType::BFloat16,
|
||||
"Currently only 16bit types are supported");
|
||||
STD_TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
|
||||
STD_TORCH_CHECK(A.size(-1) % 8 == 0,
|
||||
"A columns must be a multiple of 8 (128bits)");
|
||||
auto A_2d = torch::stable::view(A, {-1, A.size(-1)});
|
||||
|
||||
torch::Tensor D = torch::empty_like(A);
|
||||
torch::stable::Tensor D = torch::stable::empty_like(A);
|
||||
int sms;
|
||||
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
|
||||
int block_rows = div_ceil(A_2d.size(0), sms);
|
||||
21
csrc/libtorch_stable/torch_bindings.cpp
Normal file
21
csrc/libtorch_stable/torch_bindings.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "ops.h"
|
||||
#include "core/registration.h"
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
|
||||
// Register ops with STABLE_TORCH_LIBRARY for libtorch stable ABI compatibility.
|
||||
// Note: We register under namespace "_C" so ops are accessible as
|
||||
// torch.ops._C.<op_name> for compatibility with existing code.
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(_C, m) {
|
||||
#ifndef USE_ROCM
|
||||
m.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
|
||||
#endif
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, m) {
|
||||
#ifndef USE_ROCM
|
||||
m.impl("permute_cols", TORCH_BOX(&permute_cols));
|
||||
#endif
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(_C_stable_libtorch)
|
||||
13
csrc/libtorch_stable/torch_utils.h
Normal file
13
csrc/libtorch_stable/torch_utils.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Utility to get the current CUDA stream for a given device using stable APIs.
|
||||
// Returns a cudaStream_t for use in kernel launches.
|
||||
inline cudaStream_t get_current_cuda_stream(int32_t device_index) {
|
||||
void* stream_ptr = nullptr;
|
||||
TORCH_ERROR_CODE_CHECK(
|
||||
aoti_torch_get_current_cuda_stream(device_index, &stream_ptr));
|
||||
return reinterpret_cast<cudaStream_t>(stream_ptr);
|
||||
}
|
||||
144
csrc/moe/gpt_oss_router_gemm.cu
Normal file
144
csrc/moe/gpt_oss_router_gemm.cu
Normal file
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu
|
||||
* Copyright (c) 2025, The vLLM team.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
|
||||
* All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/all.h>
|
||||
#include "gpt_oss_router_gemm.cuh"
|
||||
|
||||
void launch_gpt_oss_router_gemm(__nv_bfloat16* gA, __nv_bfloat16* gB,
|
||||
__nv_bfloat16* gC, __nv_bfloat16* bias,
|
||||
int batch_size, int output_features,
|
||||
int input_features, cudaStream_t stream) {
|
||||
static int const WARP_TILE_M = 16;
|
||||
static int const TILE_M = WARP_TILE_M;
|
||||
static int const TILE_N = 8;
|
||||
static int const TILE_K = 64;
|
||||
static int const STAGES = 16;
|
||||
static int const STAGE_UNROLL = 4;
|
||||
static bool const PROFILE = false;
|
||||
|
||||
CUtensorMap weight_map{};
|
||||
CUtensorMap activation_map{};
|
||||
|
||||
constexpr uint32_t rank = 2;
|
||||
uint64_t size[rank] = {(uint64_t)input_features, (uint64_t)output_features};
|
||||
uint64_t stride[rank - 1] = {input_features * sizeof(__nv_bfloat16)};
|
||||
uint32_t box_size[rank] = {TILE_K, TILE_M};
|
||||
uint32_t elem_stride[rank] = {1, 1};
|
||||
|
||||
CUresult res = cuTensorMapEncodeTiled(
|
||||
&weight_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, rank,
|
||||
gB, size, stride, box_size, elem_stride,
|
||||
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
TORCH_CHECK(res == CUDA_SUCCESS,
|
||||
"cuTensorMapEncodeTiled failed for weight_map, error code=",
|
||||
static_cast<int>(res));
|
||||
|
||||
size[1] = batch_size;
|
||||
box_size[1] = TILE_N;
|
||||
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&activation_map, CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
rank, gA, size, stride, box_size, elem_stride,
|
||||
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
TORCH_CHECK(res == CUDA_SUCCESS,
|
||||
"cuTensorMapEncodeTiled failed for activation_map, error code=",
|
||||
static_cast<int>(res));
|
||||
|
||||
int smem_size = STAGES * STAGE_UNROLL *
|
||||
(TILE_M * TILE_K * sizeof(__nv_bfloat16) +
|
||||
TILE_N * TILE_K * sizeof(__nv_bfloat16));
|
||||
|
||||
gpuErrChk(cudaFuncSetAttribute(
|
||||
gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
|
||||
STAGE_UNROLL, PROFILE>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
|
||||
|
||||
int tiles_m = (output_features + TILE_M - 1) / TILE_M;
|
||||
int tiles_n = (batch_size + TILE_N - 1) / TILE_N;
|
||||
|
||||
dim3 grid(tiles_m, tiles_n);
|
||||
dim3 block(384);
|
||||
|
||||
cudaLaunchConfig_t config;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
config.gridDim = grid;
|
||||
config.blockDim = block;
|
||||
config.dynamicSmemBytes = smem_size;
|
||||
config.stream = stream;
|
||||
config.attrs = attrs;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = 1;
|
||||
config.numAttrs = 1;
|
||||
|
||||
cudaLaunchKernelEx(
|
||||
&config,
|
||||
&gpt_oss_router_gemm_kernel<WARP_TILE_M, TILE_M, TILE_N, TILE_K, STAGES,
|
||||
STAGE_UNROLL, PROFILE>,
|
||||
gC, gA, gB, bias, output_features, batch_size, input_features, weight_map,
|
||||
activation_map, nullptr);
|
||||
}
|
||||
|
||||
void gpt_oss_router_gemm_cuda_forward(torch::Tensor& output,
|
||||
torch::Tensor input, torch::Tensor weight,
|
||||
torch::Tensor bias) {
|
||||
auto const batch_size = input.size(0);
|
||||
auto const input_dim = input.size(1);
|
||||
auto const output_dim = weight.size(0);
|
||||
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (input.scalar_type() == at::ScalarType::BFloat16) {
|
||||
launch_gpt_oss_router_gemm((__nv_bfloat16*)input.data_ptr(),
|
||||
(__nv_bfloat16*)weight.data_ptr(),
|
||||
(__nv_bfloat16*)output.mutable_data_ptr(),
|
||||
(__nv_bfloat16*)bias.data_ptr(), batch_size,
|
||||
output_dim, input_dim, stream);
|
||||
} else {
|
||||
throw std::invalid_argument("Unsupported dtype, only supports bfloat16");
|
||||
}
|
||||
}
|
||||
|
||||
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
|
||||
torch::Tensor weight, torch::Tensor bias) {
|
||||
TORCH_CHECK(input.dim() == 2, "input must be 2D");
|
||||
TORCH_CHECK(weight.dim() == 2, "weight must be 2D");
|
||||
TORCH_CHECK(bias.dim() == 1, "bias must be 1D");
|
||||
TORCH_CHECK(input.sizes()[1] == weight.sizes()[1],
|
||||
"input.size(1) must match weight.size(1)");
|
||||
TORCH_CHECK(weight.sizes()[0] == bias.sizes()[0],
|
||||
"weight.size(0) must match bias.size(0)");
|
||||
TORCH_CHECK(input.scalar_type() == at::ScalarType::BFloat16,
|
||||
"input tensor must be bfloat16");
|
||||
TORCH_CHECK(weight.scalar_type() == at::ScalarType::BFloat16,
|
||||
"weight tensor must be bfloat16");
|
||||
TORCH_CHECK(bias.scalar_type() == at::ScalarType::BFloat16,
|
||||
"bias tensor must be bfloat16");
|
||||
gpt_oss_router_gemm_cuda_forward(output, input, weight, bias);
|
||||
}
|
||||
447
csrc/moe/gpt_oss_router_gemm.cuh
Normal file
447
csrc/moe/gpt_oss_router_gemm.cuh
Normal file
@@ -0,0 +1,447 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/TensorRT-LLM/blob/v1.3.0rc7/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
|
||||
* Copyright (c) 2025, The vLLM team.
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
|
||||
* All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "cuda_bf16.h"
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
#include "cuda_pipeline.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda/barrier>
|
||||
#include <cuda/std/utility>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using barrier = cuda::barrier<cuda::thread_scope_block>;
|
||||
namespace cde = cuda::device::experimental;
|
||||
namespace ptx = cuda::ptx;
|
||||
|
||||
#define gpuErrChk(ans) \
|
||||
{ \
|
||||
gpuAssert((ans), __FILE__, __LINE__); \
|
||||
}
|
||||
|
||||
inline void gpuAssert(cudaError_t code, char const* file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort) {
|
||||
throw std::runtime_error(cudaGetErrorString(code));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
__device__ uint64_t gclock64() {
|
||||
unsigned long long int rv;
|
||||
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(rv));
|
||||
return rv;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix(__nv_bfloat16 rv[2], uint32_t smem_ptr) {
|
||||
int dst;
|
||||
asm volatile("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];\n"
|
||||
: "=r"(dst)
|
||||
: "r"(smem_ptr));
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = dst;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix2(__nv_bfloat16 rv[4], uint32_t smem_ptr) {
|
||||
int x, y;
|
||||
asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n"
|
||||
: "=r"(x), "=r"(y)
|
||||
: "r"(smem_ptr));
|
||||
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = x;
|
||||
rvi[1] = y;
|
||||
}
|
||||
|
||||
__device__ void ldmatrix4(__nv_bfloat16 rv[8], uint32_t smem_ptr) {
|
||||
int x, y, z, w;
|
||||
asm volatile(
|
||||
"ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];"
|
||||
: "=r"(x), "=r"(y), "=r"(z), "=r"(w)
|
||||
: "r"(smem_ptr));
|
||||
int* rvi = reinterpret_cast<int*>(&rv[0]);
|
||||
rvi[0] = x;
|
||||
rvi[1] = y;
|
||||
rvi[2] = z;
|
||||
rvi[3] = w;
|
||||
}
|
||||
|
||||
__device__ void HMMA_1688(float d[4], __nv_bfloat16 a[4], __nv_bfloat16 b[2],
|
||||
float c[4]) {
|
||||
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
|
||||
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
|
||||
float const* C = reinterpret_cast<float const*>(&c[0]);
|
||||
float* D = reinterpret_cast<float*>(&d[0]);
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
|
||||
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
|
||||
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]),
|
||||
"f"(C[3]));
|
||||
}
|
||||
|
||||
__device__ void HMMA_16816(float d[4], __nv_bfloat16 a[8], __nv_bfloat16 b[4],
|
||||
float c[4]) {
|
||||
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a[0]);
|
||||
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b[0]);
|
||||
float const* C = reinterpret_cast<float const*>(&c[0]);
|
||||
float* D = reinterpret_cast<float*>(&d[0]);
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
|
||||
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
|
||||
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
|
||||
: "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
|
||||
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
|
||||
}
|
||||
|
||||
__device__ void bar_wait(uint32_t bar_ptr, int phase) {
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .pred P1;\n"
|
||||
"LAB_WAIT:\n"
|
||||
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
|
||||
"@P1 bra.uni DONE;\n"
|
||||
"bra.uni LAB_WAIT;\n"
|
||||
"DONE:\n"
|
||||
"}\n" ::"r"(bar_ptr),
|
||||
"r"(phase));
|
||||
}
|
||||
|
||||
__device__ bool bar_try_wait(uint32_t bar_ptr, int phase) {
|
||||
uint32_t success;
|
||||
#ifdef INTERNAL
|
||||
asm volatile(".pragma \"set knob DontInsertYield\";\n" : : : "memory");
|
||||
#endif
|
||||
asm volatile(
|
||||
"{\n\t"
|
||||
".reg .pred P1; \n\t"
|
||||
"mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
|
||||
"selp.b32 %0, 1, 0, P1; \n\t"
|
||||
"}"
|
||||
: "=r"(success)
|
||||
: "r"(bar_ptr), "r"(phase));
|
||||
return success;
|
||||
}
|
||||
|
||||
__device__ uint32_t elect_one_sync() {
|
||||
uint32_t pred = 0;
|
||||
uint32_t laneid = 0;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b32 %%rx;\n"
|
||||
".reg .pred %%px;\n"
|
||||
" elect.sync %%rx|%%px, %2;\n"
|
||||
"@%%px mov.s32 %1, 1;\n"
|
||||
" mov.s32 %0, %%rx;\n"
|
||||
"}\n"
|
||||
: "+r"(laneid), "+r"(pred)
|
||||
: "r"(0xFFFFFFFF));
|
||||
return pred;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct Profile {
|
||||
uint64_t start;
|
||||
uint64_t weight_load_start;
|
||||
uint64_t act_load_start;
|
||||
uint64_t compute_start;
|
||||
uint64_t complete;
|
||||
};
|
||||
|
||||
template <int WARP_TILE_M, int TILE_M, int TILE_N, int TILE_K, int STAGES,
|
||||
int STAGE_UNROLL, bool PROFILE>
|
||||
__global__ __launch_bounds__(384, 1) void gpt_oss_router_gemm_kernel(
|
||||
__nv_bfloat16* output, __nv_bfloat16* weights, __nv_bfloat16* activations,
|
||||
__nv_bfloat16* bias, int M, int N, int K,
|
||||
const __grid_constant__ CUtensorMap weight_map,
|
||||
const __grid_constant__ CUtensorMap activation_map,
|
||||
Profile* profile = nullptr) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
|
||||
if (PROFILE && threadIdx.x == 0 && blockIdx.y == 0)
|
||||
profile[blockIdx.x].start = gclock64();
|
||||
|
||||
extern __shared__ __align__(128) char smem[];
|
||||
|
||||
__nv_bfloat16* sh_weights = (__nv_bfloat16*)&smem[0];
|
||||
__nv_bfloat16* sh_activations =
|
||||
(__nv_bfloat16*)&smem[STAGES * STAGE_UNROLL * TILE_M * TILE_K *
|
||||
sizeof(__nv_bfloat16)];
|
||||
|
||||
#pragma nv_diag_suppress static_var_with_dynamic_init
|
||||
__shared__ barrier bar_wt_ready[STAGES];
|
||||
__shared__ barrier bar_act_ready[STAGES];
|
||||
__shared__ barrier bar_data_consumed[STAGES];
|
||||
|
||||
__shared__ float4 reduction_buffer[128];
|
||||
|
||||
__shared__ nv_bfloat16 sh_bias[TILE_M];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
for (int i = 0; i < STAGES; i++) {
|
||||
init(&bar_wt_ready[i], 1);
|
||||
init(&bar_act_ready[i], 1);
|
||||
init(&bar_data_consumed[i], 32);
|
||||
}
|
||||
ptx::fence_proxy_async(ptx::space_shared);
|
||||
asm volatile("prefetch.tensormap [%0];"
|
||||
:
|
||||
: "l"(reinterpret_cast<uint64_t>(&weight_map))
|
||||
: "memory");
|
||||
asm volatile("prefetch.tensormap [%0];"
|
||||
:
|
||||
: "l"(reinterpret_cast<uint64_t>(&activation_map))
|
||||
: "memory");
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
|
||||
int phase = 0;
|
||||
|
||||
int mib = blockIdx.x * TILE_M;
|
||||
int ni = blockIdx.y * TILE_N;
|
||||
|
||||
float accum[4];
|
||||
for (int i = 0; i < 4; i++) accum[i] = 0.f;
|
||||
|
||||
int const K_LOOPS_DMA =
|
||||
(K + 4 * TILE_K * STAGE_UNROLL - 1) / (4 * (TILE_K * STAGE_UNROLL));
|
||||
int const K_LOOPS_COMPUTE = K_LOOPS_DMA;
|
||||
|
||||
// Data loading thread
|
||||
if (warp_id >= 4 && elect_one_sync()) {
|
||||
int stage = warp_id % 4;
|
||||
|
||||
bool weight_warp = warp_id < 8;
|
||||
if (!weight_warp) {
|
||||
cudaGridDependencySynchronize();
|
||||
cudaTriggerProgrammaticLaunchCompletion();
|
||||
}
|
||||
|
||||
for (int ki = 0; ki < K_LOOPS_DMA; ki++) {
|
||||
int k = (ki * 4 + (warp_id % 4)) * TILE_K * STAGE_UNROLL;
|
||||
|
||||
uint64_t desc_ptr_wt = reinterpret_cast<uint64_t>(&weight_map);
|
||||
uint64_t desc_ptr_act = reinterpret_cast<uint64_t>(&activation_map);
|
||||
|
||||
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
|
||||
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
|
||||
int bytes_wt = TILE_M * TILE_K * sizeof(__nv_bfloat16);
|
||||
int bytes_act = TILE_N * TILE_K * sizeof(__nv_bfloat16);
|
||||
|
||||
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
|
||||
|
||||
if (weight_warp)
|
||||
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
|
||||
:
|
||||
: "r"(bar_ptr_wt), "r"(STAGE_UNROLL * bytes_wt));
|
||||
if (!weight_warp)
|
||||
asm volatile("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"
|
||||
:
|
||||
: "r"(bar_ptr_act), "r"(STAGE_UNROLL * bytes_act));
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && ki == 0 && weight_warp)
|
||||
profile[blockIdx.x].weight_load_start = gclock64();
|
||||
if (PROFILE && blockIdx.y == 0 && ki == 0 && !weight_warp)
|
||||
profile[blockIdx.x].act_load_start = gclock64();
|
||||
|
||||
for (int i = 0; i < STAGE_UNROLL; i++) {
|
||||
uint32_t smem_ptr_wt = __cvta_generic_to_shared(
|
||||
&sh_weights[(stage * STAGE_UNROLL + i) * TILE_M * TILE_K]);
|
||||
uint32_t crd0 = k + i * TILE_K;
|
||||
uint32_t crd1 = mib;
|
||||
if (weight_warp)
|
||||
asm volatile(
|
||||
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
|
||||
"tx::bytes [%0], [%1, {%3,%4}], "
|
||||
"[%2];"
|
||||
:
|
||||
: "r"(smem_ptr_wt), "l"(desc_ptr_wt), "r"(bar_ptr_wt), "r"(crd0),
|
||||
"r"(crd1)
|
||||
: "memory");
|
||||
|
||||
uint32_t smem_ptr_act = __cvta_generic_to_shared(
|
||||
&sh_activations[(stage * STAGE_UNROLL + i) * TILE_N * TILE_K]);
|
||||
crd0 = k + i * TILE_K;
|
||||
crd1 = ni;
|
||||
if (!weight_warp)
|
||||
asm volatile(
|
||||
"cp.async.bulk.tensor.2d.shared::cta.global.mbarrier::complete_"
|
||||
"tx::bytes [%0], [%1, {%3,%4}], "
|
||||
"[%2];"
|
||||
:
|
||||
: "r"(smem_ptr_act), "l"(desc_ptr_act), "r"(bar_ptr_act),
|
||||
"r"(crd0), "r"(crd1)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
stage += 4;
|
||||
if (stage >= STAGES) {
|
||||
stage = warp_id % 4;
|
||||
phase ^= 1;
|
||||
}
|
||||
}
|
||||
// Wait for pending loads to be consumed before exiting, to avoid race
|
||||
for (int i = 0; i < (STAGES / 4) - 1; i++) {
|
||||
bar_wait(__cvta_generic_to_shared(&bar_data_consumed[stage]), phase ^ 1);
|
||||
stage += 4;
|
||||
if (stage >= STAGES) {
|
||||
stage = warp_id % 4;
|
||||
phase ^= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute threads
|
||||
else if (warp_id < 4) {
|
||||
// Sneak the bias load into the compute warps since they're just waiting for
|
||||
// stuff anyway
|
||||
if (threadIdx.x < TILE_M) sh_bias[threadIdx.x] = bias[mib + threadIdx.x];
|
||||
|
||||
int stage = warp_id;
|
||||
|
||||
int phase = 0;
|
||||
int lane_id_div8 = lane_id / 8;
|
||||
int lane_id_mod8 = lane_id % 8;
|
||||
|
||||
int lane_row_offset_wt = (lane_id_div8 % 2) ? 8 : 0;
|
||||
int lane_col_offset_wt = (lane_id_div8 / 2) ? 1 : 0;
|
||||
|
||||
int row_wt = lane_id_mod8 + lane_row_offset_wt;
|
||||
int row_act = lane_id_mod8;
|
||||
|
||||
int row_offset_wt = (reinterpret_cast<uintptr_t>(sh_weights) / 128) % 8;
|
||||
int row_offset_act = row_offset_wt;
|
||||
|
||||
uint32_t bar_ptr_wt = __cvta_generic_to_shared(&bar_wt_ready[stage]);
|
||||
uint32_t bar_ptr_act = __cvta_generic_to_shared(&bar_act_ready[stage]);
|
||||
|
||||
bool weight_ready = bar_try_wait(bar_ptr_wt, phase);
|
||||
bool act_ready = bar_try_wait(bar_ptr_act, phase);
|
||||
|
||||
#pragma unroll 2
|
||||
for (int ki = 0; ki < K_LOOPS_COMPUTE; ki++) {
|
||||
int next_stage = stage + 4;
|
||||
int next_phase = phase;
|
||||
if (next_stage >= STAGES) {
|
||||
next_stage = warp_id;
|
||||
next_phase ^= 1;
|
||||
}
|
||||
|
||||
while (!weight_ready || !act_ready) {
|
||||
weight_ready = bar_try_wait(bar_ptr_wt, phase);
|
||||
act_ready = bar_try_wait(bar_ptr_act, phase);
|
||||
}
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0 && ki == 0)
|
||||
profile[blockIdx.x].compute_start = gclock64();
|
||||
|
||||
if (ki + 1 < K_LOOPS_COMPUTE) {
|
||||
weight_ready = bar_try_wait(
|
||||
__cvta_generic_to_shared(&bar_wt_ready[next_stage]), next_phase);
|
||||
act_ready = bar_try_wait(
|
||||
__cvta_generic_to_shared(&bar_act_ready[next_stage]), next_phase);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int su = 0; su < STAGE_UNROLL; su++) {
|
||||
__nv_bfloat16* ptr_weights =
|
||||
&sh_weights[(stage * STAGE_UNROLL + su) * TILE_M * TILE_K];
|
||||
__nv_bfloat16* ptr_act =
|
||||
&sh_activations[(stage * STAGE_UNROLL + su) * TILE_N * TILE_K];
|
||||
|
||||
#pragma unroll
|
||||
for (int kii = 0; kii < TILE_K / 16; kii++) {
|
||||
__nv_bfloat16 a[8];
|
||||
__nv_bfloat16 b[4];
|
||||
|
||||
int col = 2 * kii + lane_col_offset_wt;
|
||||
int col_sw = ((row_wt + row_offset_wt) % 8) ^ col;
|
||||
|
||||
ldmatrix4(a, __cvta_generic_to_shared(
|
||||
&ptr_weights[row_wt * TILE_K + col_sw * 8]));
|
||||
|
||||
col = 2 * kii + lane_id_div8;
|
||||
col_sw = ((row_act + row_offset_act) % 8) ^ col;
|
||||
|
||||
ldmatrix2(b, __cvta_generic_to_shared(
|
||||
&ptr_act[row_act * TILE_K + 8 * col_sw]));
|
||||
|
||||
HMMA_16816(accum, a, b, accum);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t bar_c = __cvta_generic_to_shared(&bar_data_consumed[stage]);
|
||||
asm volatile("mbarrier.arrive.shared::cta.b64 _, [%0];" : : "r"(bar_c));
|
||||
|
||||
stage = next_stage;
|
||||
phase = next_phase;
|
||||
}
|
||||
|
||||
float4 accum4;
|
||||
accum4.x = accum[0];
|
||||
accum4.y = accum[1];
|
||||
accum4.z = accum[2];
|
||||
accum4.w = accum[3];
|
||||
reduction_buffer[threadIdx.x] = accum4;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (warp_id == 0) {
|
||||
int mi = mib + warp_id * WARP_TILE_M;
|
||||
int tm = mi + lane_id / 4;
|
||||
int tn = ni + 2 * (lane_id % 4);
|
||||
|
||||
float4 accum1 = reduction_buffer[32 + threadIdx.x];
|
||||
float4 accum2 = reduction_buffer[64 + threadIdx.x];
|
||||
float4 accum3 = reduction_buffer[96 + threadIdx.x];
|
||||
|
||||
accum[0] = accum[0] + accum1.x + accum2.x + accum3.x;
|
||||
accum[1] = accum[1] + accum1.y + accum2.y + accum3.y;
|
||||
accum[2] = accum[2] + accum1.z + accum2.z + accum3.z;
|
||||
accum[3] = accum[3] + accum1.w + accum2.w + accum3.w;
|
||||
|
||||
float bias_lo = __bfloat162float(sh_bias[tm - mib]);
|
||||
float bias_hi = __bfloat162float(sh_bias[tm + 8 - mib]);
|
||||
|
||||
if (tn < N && tm < M)
|
||||
output[tn * M + tm] = __float2bfloat16(accum[0] + bias_lo);
|
||||
if (tn + 1 < N && tm < M)
|
||||
output[(tn + 1) * M + tm] = __float2bfloat16(accum[1] + bias_lo);
|
||||
if (tn < N && tm + 8 < M)
|
||||
output[tn * M + tm + 8] = __float2bfloat16(accum[2] + bias_hi);
|
||||
if (tn + 1 < N && tm + 8 < M)
|
||||
output[(tn + 1) * M + tm + 8] = __float2bfloat16(accum[3] + bias_hi);
|
||||
|
||||
if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0)
|
||||
profile[blockIdx.x].complete = gclock64();
|
||||
}
|
||||
}
|
||||
#endif // end if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
|
||||
}
|
||||
@@ -70,4 +70,8 @@ torch::Tensor router_gemm_bf16_fp32(torch::Tensor const& input,
|
||||
// Supports num_tokens in [1, 16], num_experts in {256, 384}, hidden_dim = 7168
|
||||
void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a,
|
||||
const torch::Tensor& mat_b);
|
||||
|
||||
// gpt-oss optimized router GEMM kernel for SM90+
|
||||
void gpt_oss_router_gemm(torch::Tensor& output, torch::Tensor input,
|
||||
torch::Tensor weight, torch::Tensor bias);
|
||||
#endif
|
||||
|
||||
@@ -73,10 +73,9 @@ void moe_permute(
|
||||
MOE_DISPATCH(input.scalar_type(), [&] {
|
||||
expandInputRowsKernelLauncher<scalar_t>(
|
||||
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
|
||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
|
||||
n_hidden, topk, n_local_expert, stream);
|
||||
get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
|
||||
get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
|
||||
n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
|
||||
|
||||
template <typename T>
|
||||
void expandInputRowsKernelLauncher(
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
template <typename T, bool CHECK_SKIPPED>
|
||||
__global__ void expandInputRowsKernel(
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||
@@ -16,7 +16,6 @@ __global__ void expandInputRowsKernel(
|
||||
int64_t expanded_dest_row = blockIdx.x;
|
||||
int64_t const expanded_source_row =
|
||||
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
|
||||
int expert_id = sorted_experts[expanded_dest_row];
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
assert(expanded_dest_row <= INT32_MAX);
|
||||
@@ -54,7 +53,7 @@ __global__ void expandInputRowsKernel(
|
||||
|
||||
template <typename T>
|
||||
void expandInputRowsKernelLauncher(
|
||||
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
|
||||
T const* unpermuted_input, T* permuted_output,
|
||||
int const* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
|
||||
int64_t const* expert_first_token_offset, int64_t const num_rows,
|
||||
@@ -70,12 +69,12 @@ void expandInputRowsKernelLauncher(
|
||||
bool is_check_skip = num_valid_tokens_ptr != nullptr;
|
||||
auto func = func_map[is_check_skip];
|
||||
|
||||
func<<<blocks, threads, 0, stream>>>(
|
||||
unpermuted_input, permuted_output, sorted_experts,
|
||||
expanded_dest_row_to_expanded_source_row,
|
||||
expanded_source_row_to_expanded_dest_row, permuted_idx,
|
||||
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
|
||||
num_local_experts);
|
||||
func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
|
||||
expanded_dest_row_to_expanded_source_row,
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
permuted_idx, expert_first_token_offset,
|
||||
num_rows, num_valid_tokens_ptr, cols, k,
|
||||
num_local_experts);
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
|
||||
@@ -132,6 +132,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||
// DeepSeek V3 optimized router GEMM for SM90+
|
||||
m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
// gpt-oss optimized router GEMM kernel for SM90+
|
||||
m.def(
|
||||
"gpt_oss_router_gemm(Tensor! output, Tensor input, Tensor weights, "
|
||||
"Tensor bias) -> ()");
|
||||
m.impl("gpt_oss_router_gemm", torch::kCUDA, &gpt_oss_router_gemm);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
16
csrc/ops.h
16
csrc/ops.h
@@ -201,7 +201,6 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
|
||||
torch::Tensor _zeros, int64_t split_k_iters,
|
||||
int64_t thx, int64_t thy);
|
||||
|
||||
torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
|
||||
#endif
|
||||
|
||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||
@@ -262,7 +261,8 @@ void get_cutlass_moe_mm_data(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
|
||||
|
||||
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_scale,
|
||||
torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout);
|
||||
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
|
||||
torch::Tensor const& input, torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout);
|
||||
|
||||
void scaled_fp4_quant_out(torch::Tensor const& input,
|
||||
torch::Tensor const& input_scale,
|
||||
bool is_sf_swizzled_layout, torch::Tensor& output,
|
||||
torch::Tensor& output_scale);
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
@@ -51,9 +53,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor const& output_scale_offset_by_experts);
|
||||
#endif
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_sf, torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
void scaled_fp4_quant_out(torch::Tensor const& input,
|
||||
torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout, torch::Tensor& output,
|
||||
torch::Tensor& output_sf) {
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
|
||||
@@ -62,6 +65,34 @@ void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
|
||||
}
|
||||
|
||||
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
|
||||
torch::Tensor const& input, torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
int64_t n = input.size(-1);
|
||||
int64_t m = input.numel() / n;
|
||||
auto device = input.device();
|
||||
|
||||
// Two fp4 values packed into a uint8
|
||||
auto output = torch::empty(
|
||||
{m, n / 2}, torch::TensorOptions().device(device).dtype(torch::kUInt8));
|
||||
|
||||
torch::Tensor output_sf;
|
||||
if (is_sf_swizzled_layout) {
|
||||
auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
|
||||
output_sf = torch::empty(
|
||||
{sf_m, sf_n},
|
||||
torch::TensorOptions().device(device).dtype(torch::kInt32));
|
||||
} else {
|
||||
output_sf = torch::empty(
|
||||
{m, n / CVT_FP4_SF_VEC_SIZE},
|
||||
torch::TensorOptions().device(device).dtype(torch::kUInt8));
|
||||
}
|
||||
|
||||
scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
|
||||
output_sf);
|
||||
return {output, output_sf};
|
||||
}
|
||||
|
||||
void scaled_fp4_experts_quant(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include <utility>
|
||||
|
||||
#include "../../cuda_vec_utils.cuh"
|
||||
|
||||
@@ -54,6 +55,18 @@ inline int computeEffectiveRows(int m) {
|
||||
return round_up(m, ROW_TILE);
|
||||
}
|
||||
|
||||
// Compute the shape of the swizzled SF output tensor.
|
||||
// Returns (rounded_m, rounded_n / 4) where:
|
||||
// rounded_m = round_up(m, 128)
|
||||
// rounded_n = round_up(n / CVT_FP4_SF_VEC_SIZE, 4)
|
||||
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
|
||||
int64_t n) {
|
||||
int64_t rounded_m = round_up(m, static_cast<int64_t>(128));
|
||||
int64_t scale_n = n / CVT_FP4_SF_VEC_SIZE;
|
||||
int64_t rounded_n = round_up(scale_n, static_cast<int64_t>(4));
|
||||
return {rounded_m, rounded_n / 4};
|
||||
}
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
|
||||
uint32_t val;
|
||||
|
||||
@@ -15,31 +15,33 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
|
||||
scalar_t const* __restrict__ input, // [..., hidden_size]
|
||||
scalar_t const* __restrict__ weight, // [hidden_size]
|
||||
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr) {
|
||||
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
|
||||
float rms = 0.0f;
|
||||
float token_scale = 0.0f;
|
||||
|
||||
// Compute rms
|
||||
vllm::vectorized::compute_rms<scalar_t, has_residual>(
|
||||
&rms, input, hidden_size, var_epsilon, residual);
|
||||
&rms, input, hidden_size, input_stride, var_epsilon, residual);
|
||||
|
||||
// Compute scale
|
||||
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
|
||||
has_residual>(
|
||||
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
|
||||
residual);
|
||||
input_stride, residual);
|
||||
|
||||
// RMS Norm + Quant
|
||||
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
|
||||
token_scale = 1.0f / token_scale;
|
||||
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
|
||||
has_residual>(
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
has_residual>(out, input, weight, rms,
|
||||
&token_scale, hidden_size,
|
||||
input_stride, residual);
|
||||
} else {
|
||||
// FP8 - Do not invert token_scale for exact match with FBGemm
|
||||
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
|
||||
has_residual>(
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
has_residual>(out, input, weight, rms,
|
||||
&token_scale, hidden_size,
|
||||
input_stride, residual);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,38 +53,40 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
|
||||
scalar_t const* __restrict__ input, // [..., hidden_size]
|
||||
scalar_t const* __restrict__ weight, // [hidden_size]
|
||||
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr) {
|
||||
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
|
||||
// For vectorization, token_input and token_output pointers need to be
|
||||
// aligned at 8-byte and 4-byte addresses respectively.
|
||||
bool const can_vectorize = hidden_size % 4 == 0;
|
||||
bool const can_vectorize = hidden_size % 4 == 0 and input_stride % 4 == 0;
|
||||
|
||||
if (can_vectorize) {
|
||||
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
|
||||
has_residual>(
|
||||
out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
|
||||
residual);
|
||||
input_stride, residual);
|
||||
}
|
||||
|
||||
float rms = 0.0f;
|
||||
float token_scale = 0.0f;
|
||||
|
||||
// Compute RMS
|
||||
vllm::compute_rms<scalar_t, has_residual>(&rms, input, hidden_size,
|
||||
var_epsilon, residual);
|
||||
vllm::compute_rms<scalar_t, has_residual>(
|
||||
&rms, input, hidden_size, input_stride, var_epsilon, residual);
|
||||
// Compute Scale
|
||||
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
|
||||
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
|
||||
residual);
|
||||
input_stride, residual);
|
||||
|
||||
// RMS Norm + Quant
|
||||
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
|
||||
token_scale = 1.0f / token_scale;
|
||||
vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, input_stride,
|
||||
residual);
|
||||
} else {
|
||||
// FP8 - Do not invert s_token_scale for exact match with FBGemm
|
||||
vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
|
||||
out, input, weight, rms, &token_scale, hidden_size, residual);
|
||||
out, input, weight, rms, &token_scale, hidden_size, input_stride,
|
||||
residual);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,19 +101,20 @@ __global__ void rms_norm_per_block_quant_kernel(
|
||||
scalar_t const* __restrict__ input, // [..., hidden_size]
|
||||
scalar_t const* __restrict__ weight, // [hidden_size]
|
||||
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
|
||||
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr,
|
||||
int64_t outer_scale_stride = 1) {
|
||||
float rms;
|
||||
// Compute RMS
|
||||
// Always able to vectorize due to constraints on hidden_size
|
||||
vllm::vectorized::compute_rms<scalar_t, has_residual>(
|
||||
&rms, input, hidden_size, var_epsilon, residual);
|
||||
&rms, input, hidden_size, input_stride, var_epsilon, residual);
|
||||
|
||||
// Compute Scale
|
||||
// Always able to vectorize due to constraints on hidden_size and group_size
|
||||
vllm::vectorized::compute_dynamic_per_token_scales<
|
||||
scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
|
||||
nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual,
|
||||
outer_scale_stride);
|
||||
nullptr, scales, input, weight, rms, scale_ub, hidden_size, input_stride,
|
||||
residual, outer_scale_stride);
|
||||
|
||||
// RMS Norm + Quant
|
||||
// Always able to vectorize due to constraints on hidden_size
|
||||
@@ -120,7 +125,7 @@ __global__ void rms_norm_per_block_quant_kernel(
|
||||
vllm::vectorized::norm_and_quant<
|
||||
scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
|
||||
has_residual, is_scale_transposed, group_size>(
|
||||
out, input, weight, rms, scales, hidden_size, residual,
|
||||
out, input, weight, rms, scales, hidden_size, input_stride, residual,
|
||||
outer_scale_stride);
|
||||
}
|
||||
|
||||
@@ -137,6 +142,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
|
||||
std::optional<at::Tensor> const& scale_ub,
|
||||
std::optional<at::Tensor>& residual) {
|
||||
int32_t hidden_size = input.size(-1);
|
||||
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
|
||||
auto num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
@@ -153,7 +159,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
|
||||
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
|
||||
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
|
||||
var_epsilon, hidden_size,
|
||||
var_epsilon, hidden_size, input_stride,
|
||||
has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
|
||||
});
|
||||
});
|
||||
@@ -170,7 +176,9 @@ void rms_norm_dynamic_per_token_quant(
|
||||
? c10::ScalarType::Float8_e4m3fn
|
||||
: c10::ScalarType::Float8_e4m3fnuz;
|
||||
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
|
||||
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(input.stride(-1) == 1,
|
||||
"Input must be contiguous in the last dimension");
|
||||
|
||||
if (scale_ub.has_value()) {
|
||||
TORCH_CHECK(out.dtype() == kFp8Type);
|
||||
@@ -179,6 +187,7 @@ void rms_norm_dynamic_per_token_quant(
|
||||
TORCH_CHECK(scales.dtype() == torch::kFloat32);
|
||||
if (residual) {
|
||||
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(residual->is_contiguous());
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
@@ -200,6 +209,15 @@ void rms_norm_per_block_quant_dispatch(
|
||||
std::optional<at::Tensor> const& scale_ub,
|
||||
std::optional<at::Tensor>& residual, bool is_scale_transposed) {
|
||||
int32_t hidden_size = input.size(-1);
|
||||
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
|
||||
|
||||
TORCH_CHECK(hidden_size % 4 == 0,
|
||||
"Hidden size must be divisible by 4 for vectorized access");
|
||||
TORCH_CHECK(input_stride % 4 == 0,
|
||||
"Input stride must be divisible by 4 for vectorized access");
|
||||
TORCH_CHECK(group_size % 4 == 0,
|
||||
"Group size must be divisible by 4 for vectorized access");
|
||||
|
||||
auto num_tokens = input.numel() / hidden_size;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
@@ -225,7 +243,7 @@ void rms_norm_per_block_quant_dispatch(
|
||||
weight.data_ptr<scalar_in_t>(),
|
||||
scale_ub.has_value() ? scale_ub->data_ptr<float>()
|
||||
: nullptr,
|
||||
var_epsilon, hidden_size,
|
||||
var_epsilon, hidden_size, input_stride,
|
||||
has_residual ? residual->data_ptr<scalar_in_t>()
|
||||
: nullptr,
|
||||
scales.stride(1));
|
||||
@@ -246,7 +264,9 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
? c10::ScalarType::Float8_e4m3fn
|
||||
: c10::ScalarType::Float8_e4m3fnuz;
|
||||
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
|
||||
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(input.stride(-1) == 1,
|
||||
"Input must be contiguous in the last dimension");
|
||||
|
||||
if (scale_ub.has_value()) {
|
||||
TORCH_CHECK(out.dtype() == kFp8Type);
|
||||
@@ -255,6 +275,7 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
TORCH_CHECK(scales.dtype() == torch::kFloat32);
|
||||
if (residual) {
|
||||
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(residual->is_contiguous());
|
||||
}
|
||||
|
||||
TORCH_CHECK(group_size == 128 || group_size == 64,
|
||||
@@ -265,6 +286,15 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
|
||||
"Outer scale stride must be 1 when scales are not transposed");
|
||||
}
|
||||
|
||||
int64_t hidden_size = input.size(-1);
|
||||
TORCH_CHECK(hidden_size > 0 && hidden_size % group_size == 0,
|
||||
"hidden_size must be a positive multiple of group_size");
|
||||
int64_t num_tokens = input.numel() / hidden_size;
|
||||
int64_t num_groups = hidden_size / group_size;
|
||||
TORCH_CHECK(scales.numel() >= num_tokens * num_groups,
|
||||
"scales buffer too small: need ", num_tokens * num_groups,
|
||||
" elements, got ", scales.numel());
|
||||
|
||||
rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size,
|
||||
var_epsilon, scale_ub, residual,
|
||||
is_scale_transposed);
|
||||
|
||||
@@ -16,14 +16,17 @@ namespace vllm {
|
||||
// has_residual must be true, if residual is not a nullptr
|
||||
template <typename scalar_t, bool has_residual = false>
|
||||
__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
int32_t const hidden_size, float const epsilon,
|
||||
int32_t const hidden_size,
|
||||
int32_t const input_stride, float const epsilon,
|
||||
scalar_t const* __restrict__ residual = nullptr) {
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
// sum of squares
|
||||
float ss = 0.0f;
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
float x = static_cast<float>(input[input_token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
@@ -73,15 +76,20 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
|
||||
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
|
||||
float const rms, float const* __restrict__ scale_ub,
|
||||
int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
|
||||
int32_t const hidden_size, int32_t const input_stride,
|
||||
scalar_t const* __restrict__ residual = nullptr,
|
||||
int32_t const group_size = 0, int64_t outer_scale_stride = 1) {
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||
__syncthreads();
|
||||
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
if (group_size > 0) {
|
||||
__shared__ float s_max_vals[1024];
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
int64_t num_groups = hidden_size / group_size;
|
||||
__shared__ float s_max_vals[1024];
|
||||
int64_t const threads_per_group = blockDim.x / num_groups;
|
||||
int64_t const thread_in_group = threadIdx.x % threads_per_group;
|
||||
int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
|
||||
@@ -89,7 +97,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
int64_t const thread_end =
|
||||
min(group_offset + group_size, static_cast<int64_t>(hidden_size));
|
||||
for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
float x = static_cast<float>(input[input_token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
@@ -144,10 +152,8 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
}
|
||||
__syncthreads();
|
||||
} else {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
float x = static_cast<float>(input[input_token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
}
|
||||
@@ -185,12 +191,15 @@ template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
|
||||
__device__ void norm_and_quant(
|
||||
scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input,
|
||||
scalar_t const* __restrict__ weight, float const rms, float* const scale,
|
||||
int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr,
|
||||
int32_t const group_size = 0, int64_t outer_scale_stride = 1) {
|
||||
int32_t const hidden_size, int32_t const input_stride,
|
||||
scalar_t* __restrict__ residual = nullptr, int32_t const group_size = 0,
|
||||
int64_t outer_scale_stride = 1) {
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
float x = static_cast<float>(input[input_token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
residual[token_offset + i] = static_cast<scalar_t>(x);
|
||||
@@ -224,13 +233,16 @@ namespace vectorized {
|
||||
// hidden_size must be a multiple of 4
|
||||
template <typename scalar_t, bool has_residual = false>
|
||||
__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
int32_t const hidden_size, float const epsilon,
|
||||
int32_t const hidden_size,
|
||||
int32_t const input_stride, float const epsilon,
|
||||
scalar_t const* __restrict__ residual = nullptr) {
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
// Vectorized input/output to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vec_input =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
|
||||
vec4_t<scalar_t> const* vec_residual = nullptr;
|
||||
if constexpr (has_residual) {
|
||||
vec_residual =
|
||||
@@ -288,7 +300,8 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
|
||||
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
|
||||
float const rms, float const* __restrict__ scale_ub,
|
||||
int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
|
||||
int32_t const hidden_size, int32_t const input_stride,
|
||||
scalar_t const* __restrict__ residual = nullptr,
|
||||
int64_t outer_scale_stride = 1) {
|
||||
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
|
||||
|
||||
@@ -300,10 +313,13 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
vec4_t<scalar_t> const* vec_weight = nullptr;
|
||||
vec4_t<scalar_t> const* vec_residual = nullptr;
|
||||
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
if constexpr (group_size > 0) {
|
||||
__shared__ float s_max_vals[1024];
|
||||
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
int64_t const num_groups = hidden_size / group_size;
|
||||
int64_t const threads_per_group = blockDim.x / num_groups;
|
||||
int64_t const thread_in_group = threadIdx.x % threads_per_group;
|
||||
@@ -312,7 +328,8 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
int64_t const thread_offset = group_offset + thread_in_group;
|
||||
int64_t const thread_end = min(group_offset + (group_size >> 2),
|
||||
static_cast<int64_t>(hidden_size >> 2));
|
||||
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
vec_input =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
|
||||
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
if constexpr (has_residual) {
|
||||
vec_residual =
|
||||
@@ -396,8 +413,8 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
__syncthreads();
|
||||
|
||||
} else {
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
vec_input =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
|
||||
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
if constexpr (has_residual) {
|
||||
vec_residual =
|
||||
@@ -462,18 +479,18 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
|
||||
bool has_residual = false, bool is_scale_transposed = false,
|
||||
int32_t group_size = 0>
|
||||
__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
scalar_t const* __restrict__ input,
|
||||
scalar_t const* __restrict__ weight,
|
||||
float const rms, float* const scale,
|
||||
int32_t const hidden_size,
|
||||
scalar_t* __restrict__ residual = nullptr,
|
||||
int64_t outer_scale_stride = 1) {
|
||||
__device__ void norm_and_quant(
|
||||
scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input,
|
||||
scalar_t const* __restrict__ weight, float const rms, float* const scale,
|
||||
int32_t const hidden_size, int32_t const input_stride,
|
||||
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
|
||||
int64_t const input_token_offset =
|
||||
blockIdx.x * static_cast<int64_t>(input_stride);
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
|
||||
// Vectorized input/output/weight/residual to better utilize memory bandwidth.
|
||||
vec4_t<scalar_t> const* vec_input =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
|
||||
vec4_t<scalar_t> const* vec_weight =
|
||||
reinterpret_cast<vec4_t<scalar_t> const*>(weight);
|
||||
q8x4_t<scalar_out_t>* vec_output =
|
||||
|
||||
@@ -17,8 +17,11 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
|
||||
int32_t* problem_sizes2,
|
||||
int32_t* atomic_buffer,
|
||||
const int topk_length, const int n,
|
||||
const int k) {
|
||||
const int k, const bool is_gated) {
|
||||
int expert_id = blockIdx.x;
|
||||
// For gated activations (gate + up), first GEMM output is 2*n.
|
||||
// For non-gated activations (up only), first GEMM output is n.
|
||||
int const n1 = is_gated ? 2 * n : n;
|
||||
|
||||
int occurrences = 0;
|
||||
for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
|
||||
@@ -31,13 +34,13 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
|
||||
int final_occurrences = atomic_buffer[expert_id];
|
||||
if constexpr (!SWAP_AB) {
|
||||
problem_sizes1[expert_id * 3] = final_occurrences;
|
||||
problem_sizes1[expert_id * 3 + 1] = 2 * n;
|
||||
problem_sizes1[expert_id * 3 + 1] = n1;
|
||||
problem_sizes1[expert_id * 3 + 2] = k;
|
||||
problem_sizes2[expert_id * 3] = final_occurrences;
|
||||
problem_sizes2[expert_id * 3 + 1] = k;
|
||||
problem_sizes2[expert_id * 3 + 2] = n;
|
||||
} else {
|
||||
problem_sizes1[expert_id * 3] = 2 * n;
|
||||
problem_sizes1[expert_id * 3] = n1;
|
||||
problem_sizes1[expert_id * 3 + 1] = final_occurrences;
|
||||
problem_sizes1[expert_id * 3 + 2] = k;
|
||||
problem_sizes2[expert_id * 3] = k;
|
||||
@@ -107,13 +110,11 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& atomic_buffer,
|
||||
int64_t num_experts, int64_t n,
|
||||
int64_t k, cudaStream_t stream,
|
||||
const bool swap_ab) {
|
||||
inline void launch_compute_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, torch::Tensor& atomic_buffer,
|
||||
int64_t num_experts, int64_t n, int64_t k, cudaStream_t stream,
|
||||
const bool swap_ab, const bool is_gated) {
|
||||
int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
|
||||
|
||||
auto const* topk_ptr = topk_ids.data_ptr<int32_t>();
|
||||
@@ -125,7 +126,7 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
compute_problem_sizes<SwapAB><<<num_experts, num_threads, 0, stream>>>(
|
||||
topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr,
|
||||
static_cast<int>(topk_ids.numel()), static_cast<int>(n),
|
||||
static_cast<int>(k));
|
||||
static_cast<int>(k), is_gated);
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
@@ -222,7 +223,8 @@ void get_cutlass_moe_mm_data_caller(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||
auto options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||
@@ -236,7 +238,7 @@ void get_cutlass_moe_mm_data_caller(
|
||||
|
||||
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
|
||||
atomic_buffer, num_experts, n, k, stream,
|
||||
may_swap_ab);
|
||||
may_swap_ab, is_gated);
|
||||
|
||||
if (blockscale_offsets.has_value()) {
|
||||
// fp4 path
|
||||
|
||||
@@ -75,7 +75,8 @@ void get_cutlass_moe_mm_data_caller(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
@@ -278,7 +279,8 @@ void get_cutlass_moe_mm_data(
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
const bool is_gated) {
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
@@ -288,7 +290,7 @@ void get_cutlass_moe_mm_data(
|
||||
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
||||
problem_sizes2, input_permutation,
|
||||
output_permutation, num_experts, n, k,
|
||||
blockscale_offsets);
|
||||
blockscale_offsets, is_gated);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
|
||||
@@ -26,6 +26,16 @@
|
||||
#define __HIP__GFX9__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && \
|
||||
(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1150__) || \
|
||||
defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__))
|
||||
#define __HIP__GFX1X__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__))
|
||||
#define __HIP__GFX12__
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __HIP__MI3XX__
|
||||
#endif
|
||||
@@ -37,15 +47,31 @@
|
||||
#endif
|
||||
|
||||
int get_lds_size() {
|
||||
static bool is_cached = false;
|
||||
static int result;
|
||||
if (is_cached == false) {
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
size_t substring = device_arch.find("gfx95");
|
||||
result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
|
||||
is_cached = true;
|
||||
}
|
||||
static const int result = [] {
|
||||
const auto* dprops = at::cuda::getCurrentDeviceProperties();
|
||||
const std::string device_arch = dprops->gcnArchName;
|
||||
return device_arch.find("gfx95") == std::string::npos ? 64 * 1024
|
||||
: 160 * 1024;
|
||||
}();
|
||||
return result;
|
||||
}
|
||||
|
||||
bool on_gfx1x() {
|
||||
static const bool result = [] {
|
||||
const auto* dprops = at::cuda::getCurrentDeviceProperties();
|
||||
const std::string device_arch = dprops->gcnArchName;
|
||||
return device_arch.find("gfx11") != std::string::npos ||
|
||||
device_arch.find("gfx12") != std::string::npos;
|
||||
}();
|
||||
return result;
|
||||
}
|
||||
|
||||
bool on_gfx12() {
|
||||
static const bool result = [] {
|
||||
const auto* dprops = at::cuda::getCurrentDeviceProperties();
|
||||
const std::string device_arch = dprops->gcnArchName;
|
||||
return device_arch.find("gfx12") != std::string::npos;
|
||||
}();
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -286,21 +312,35 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#if defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2c_f32_f16 %0, %2, %3" \
|
||||
: "=v"(V0) \
|
||||
: "0"(V0), "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#elif defined(__HIP__GFX1X__)
|
||||
// gfx1x: v_dot2_f32_f16 (VOP3-P, dot10-insts, available on gfx11+gfx12)
|
||||
#define DOT2C(V0, V2, V3) \
|
||||
if constexpr (std::is_same_v<scalar_t, half>) { \
|
||||
asm("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(V0) : "v"(V2), "v"(V3)); \
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
|
||||
float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \
|
||||
__bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \
|
||||
V0 += (s.x + s.y); \
|
||||
}
|
||||
#endif
|
||||
|
||||
// To avoid LLVM silently upcasting to double
|
||||
__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
|
||||
return min(a, b);
|
||||
}
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -442,14 +482,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -469,9 +513,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
/*float accm1 = 0;
|
||||
for (int i=0; i<64; i++)
|
||||
@@ -498,7 +543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -513,11 +558,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
|
||||
@@ -528,9 +574,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets cases where A[] marginally exceeds LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -657,14 +703,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -686,9 +736,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
// float accm1 = 0;
|
||||
// for (int i=0; i<64; i++)
|
||||
@@ -713,7 +764,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -730,6 +781,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
@@ -746,7 +798,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
|
||||
@@ -756,9 +808,9 @@ __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
|
||||
// This version targets big A[] cases, where it is much larger than LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
@@ -1004,14 +1056,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
1); // row_shr2
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
|
||||
1); // row_shr1
|
||||
#if defined(__HIP__GFX9__)
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
|
||||
1); // ROW_BCAST15
|
||||
sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
|
||||
1); // ROW_BCAST31
|
||||
#else
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1033,9 +1089,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
#ifdef __HIP__GFX9__
|
||||
#pragma unroll
|
||||
for (int n = 0; n < N; n++) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm = sum4[n][y][0];
|
||||
accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
|
||||
@@ -1057,7 +1114,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum4[n][y][0] = accm;
|
||||
}
|
||||
}
|
||||
if (threadIdx.x == 63) {
|
||||
if (threadIdx.x == (THRDS - 1)) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1074,6 +1131,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // __HIP__GFX9__ (MFMA path)
|
||||
}
|
||||
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
@@ -1090,7 +1148,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N>
|
||||
__global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
|
||||
@@ -1101,7 +1159,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#endif
|
||||
|
||||
// Find the min val of div2 that doesn't increase N/(div1*div2)
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
@@ -1148,40 +1206,40 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_YTILE, _UNRL, _N) \
|
||||
#define WVSPLITK_CFG(_THRDS, _WVPRGRP, _YTILE, _UNRL, _N) \
|
||||
{ \
|
||||
dim3 block(64, 16); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
|
||||
dim3 block(_THRDS, _WVPRGRP); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, _WVPRGRP); \
|
||||
if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_sml_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
else if (Kbp_in * N_in <= max_lds_len * 1.2) \
|
||||
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
else \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
wvSplitK_hf_big_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, af4, bf4, biasf4, c, __wvPrGrp, \
|
||||
CuCount); \
|
||||
}
|
||||
|
||||
#define WVSPLIT_TILE(_sYT, __N) \
|
||||
#define WVSPLIT_TILE_CFG(_THRDS, _WVPRGRP, _sYT, __N) \
|
||||
{ \
|
||||
bool fit_lds = (Kbp_in * N_in <= max_lds_len); \
|
||||
if (_sYT <= 1) \
|
||||
WVSPLITK(1, 4, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 1, 4, __N) \
|
||||
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
|
||||
WVSPLITK(2, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 2, 2, __N) \
|
||||
else if (_sYT <= 4 * 3) \
|
||||
WVSPLITK(3, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 3, 2, __N) \
|
||||
else if (__N == 4) \
|
||||
WVSPLITK(4, 1, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 1, __N) \
|
||||
else \
|
||||
WVSPLITK(4, 2, __N) \
|
||||
WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 2, __N) \
|
||||
}
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
|
||||
@@ -1198,18 +1256,31 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
// then cut the active waves to balance their distribution...
|
||||
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
|
||||
|
||||
const bool use_wave32 = on_gfx1x();
|
||||
switch (N_in) {
|
||||
case 1:
|
||||
WVSPLIT_TILE(sYT, 1)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 1)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 1)
|
||||
break;
|
||||
case 2:
|
||||
WVSPLIT_TILE(sYT, 2)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 2)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLIT_TILE(sYT, 3)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 3)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLIT_TILE(sYT, 4)
|
||||
if (use_wave32)
|
||||
WVSPLIT_TILE_CFG(32, 16, sYT, 4)
|
||||
else
|
||||
WVSPLIT_TILE_CFG(64, 16, sYT, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
@@ -1653,7 +1724,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
#else
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
|
||||
@@ -1688,6 +1759,8 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
|
||||
in_a.dtype() == torch::kBFloat16);
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
|
||||
auto out_c = torch::empty(
|
||||
{N_in, M_in},
|
||||
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
|
||||
@@ -1696,7 +1769,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
|
||||
dim3 grid(CuCount);
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
@@ -1773,7 +1845,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@@ -1817,12 +1889,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
|
||||
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
|
||||
|
||||
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
|
||||
float sA = *s_A;
|
||||
float sB = *s_B;
|
||||
|
||||
while (m < M) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
|
||||
float sum[N][YTILE] = {};
|
||||
#else
|
||||
// gfx9: MFMA accumulation
|
||||
scalar8 sum[N][YTILE] = {};
|
||||
#endif
|
||||
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
bigType bigA[N][UNRL] = {};
|
||||
bigType bigB[YTILE][UNRL];
|
||||
@@ -1854,6 +1931,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t n = 0; n < N; n++) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < A_CHUNK / 4; i++) {
|
||||
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
|
||||
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9: MFMA path
|
||||
for (int i = 0; i < A_CHUNK; i += 8) {
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
|
||||
@@ -1861,11 +1949,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final reduction
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9 MFMA reduction
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm0 = sum[n][y][0];
|
||||
@@ -1880,8 +1990,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum[n][y][0] = accm0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
const bool writeback_lane =
|
||||
#ifdef __HIP__GFX12__
|
||||
threadIdx.x == (THRDS - 1);
|
||||
#else
|
||||
threadIdx.x == 0;
|
||||
#endif
|
||||
if (writeback_lane) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -1892,13 +2009,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
if (y + m >= M) break; // To avoid mem access fault.
|
||||
sum[n][y][0] *= sA * sB;
|
||||
#ifdef __HIP__GFX12__
|
||||
float result = sum[n][y] * sA * sB;
|
||||
#else
|
||||
float result = sum[n][y][0] * sA * sB;
|
||||
#endif
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum[n][y][0] += __half2float(biases[n][y]);
|
||||
result += __half2float(biases[n][y]);
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
sum[n][y][0] += __bfloat162float(biases[n][y]);
|
||||
result += __bfloat162float(biases[n][y]);
|
||||
}
|
||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
|
||||
C[m + y + n * M] = __float2s<scalar_t>(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1906,7 +2027,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
|
||||
@@ -1918,9 +2039,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
|
||||
const int _WvPrGrp, const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
|
||||
#if defined(__HIP__MI3XX__) // TODO: Add NAVI support
|
||||
#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
@@ -1963,12 +2084,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
|
||||
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
|
||||
|
||||
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
|
||||
float sA = *s_A;
|
||||
float sB = *s_B;
|
||||
|
||||
while (m < M) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
|
||||
float sum[N][YTILE] = {};
|
||||
#else
|
||||
// gfx9: MFMA accumulation
|
||||
scalar8 sum[N][YTILE] = {};
|
||||
#endif
|
||||
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
bigType bigA[N][UNRL] = {};
|
||||
bigType bigB[YTILE][UNRL];
|
||||
@@ -2002,6 +2128,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t n = 0; n < N; n++) {
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < A_CHUNK / 4; i++) {
|
||||
sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
|
||||
bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9: MFMA path
|
||||
for (int i = 0; i < A_CHUNK; i += 8) {
|
||||
for (int y = 0; y < YTILE; ++y) {
|
||||
sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
|
||||
@@ -2009,11 +2146,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final reduction
|
||||
#ifdef __HIP__GFX12__
|
||||
// gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
|
||||
: "=v"(sum[n][y])
|
||||
: "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
|
||||
sum[n][y] += __shfl_xor(sum[n][y], 16);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// gfx9 MFMA reduction
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
float accm0 = sum[n][y][0];
|
||||
@@ -2028,8 +2187,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
sum[n][y][0] = accm0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
const bool writeback_lane =
|
||||
#ifdef __HIP__GFX12__
|
||||
threadIdx.x == (THRDS - 1);
|
||||
#else
|
||||
threadIdx.x == 0;
|
||||
#endif
|
||||
if (writeback_lane) {
|
||||
scalar_t biases[N][YTILE] = {};
|
||||
if (BIAS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
@@ -2040,13 +2206,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
for (int n = 0; n < N; n++) {
|
||||
for (int y = 0; y < YTILE; y++) {
|
||||
if (y + m >= M) break; // To avoid mem access fault.
|
||||
sum[n][y][0] *= sA * sB;
|
||||
#ifdef __HIP__GFX12__
|
||||
float result = sum[n][y] * sA * sB;
|
||||
#else
|
||||
float result = sum[n][y][0] * sA * sB;
|
||||
#endif
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum[n][y][0] += __half2float(biases[n][y]);
|
||||
result += __half2float(biases[n][y]);
|
||||
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
sum[n][y][0] += __bfloat162float(biases[n][y]);
|
||||
result += __bfloat162float(biases[n][y]);
|
||||
}
|
||||
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
|
||||
C[m + y + n * M] = __float2s<scalar_t>(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2054,7 +2224,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
m += CuCount * _WvPrGrp * YTILE;
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
|
||||
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
|
||||
int A_CHUNK, int UNRL, int N>
|
||||
__global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
|
||||
@@ -2066,7 +2236,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
|
||||
const int CuCount) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
|
||||
#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
@@ -2099,24 +2269,30 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size();
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
|
||||
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
#define WVSPLITKQ_IMPL(_THRDS, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
{ \
|
||||
dim3 block(_THRDS, _WvPrGrp); \
|
||||
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
|
||||
wvSplitKQ_hf_sml_<fptype, fp8_t, _THRDS, _YTILEs, _WvPrGrp, 16, _UNRLs, \
|
||||
_N><<<grid, block, 0, stream>>>( \
|
||||
K_in, Kap_in, Kbp_in, M_in, Bx_in, By_in, b_ptr, a_ptr, bias_ptr, \
|
||||
c_ptr, s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
|
||||
wvSplitKQ_hf_<fptype, fp8_t, _THRDS, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
|
||||
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
|
||||
s_a, s_b, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
if (on_gfx12()) \
|
||||
WVSPLITKQ_IMPL(32, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
|
||||
else \
|
||||
WVSPLITKQ_IMPL(64, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
|
||||
using fptype = typename scalar<scalar_t>::type;
|
||||
auto c_ptr = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||
@@ -2136,10 +2312,10 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 3)
|
||||
WVSPLITKQ(16, 2, 2, 1, 1, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLITKQ(16, 2, 2, 2, 2, 4)
|
||||
WVSPLITKQ(16, 2, 2, 1, 1, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
|
||||
@@ -575,7 +575,7 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
|
||||
// The range of logits within the row.
|
||||
int rowStart = 0;
|
||||
int seq_len = seqLens[rowIdx / next_n];
|
||||
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
|
||||
int rowEnd = max(0, seq_len - next_n + (rowIdx % next_n) + 1);
|
||||
|
||||
// Local pointers to this block
|
||||
if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
|
||||
|
||||
@@ -303,9 +303,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
") -> Tensor");
|
||||
// conditionally compiled so impl registration is in source file
|
||||
|
||||
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
|
||||
ops.impl("permute_cols", torch::kCUDA, &permute_cols);
|
||||
|
||||
// Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
|
||||
ops.def(
|
||||
"marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
|
||||
@@ -489,8 +486,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor! problem_sizes1, Tensor! problem_sizes2, "
|
||||
" Tensor! input_permutation, "
|
||||
" Tensor! output_permutation, int num_experts, "
|
||||
" int n, int k, Tensor? blockscale_offsets) -> "
|
||||
"()");
|
||||
" int n, int k, Tensor? blockscale_offsets, "
|
||||
" bool is_gated) -> ()");
|
||||
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
|
||||
|
||||
// compute per-expert problem sizes from expert_first_token_offset
|
||||
@@ -564,10 +561,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
|
||||
// Compute NVFP4 block quantized tensor.
|
||||
ops.def(
|
||||
"scaled_fp4_quant(Tensor! output, Tensor input,"
|
||||
" Tensor! output_scale, Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout) -> ()");
|
||||
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
|
||||
"scaled_fp4_quant(Tensor input,"
|
||||
" Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout) -> (Tensor, Tensor)");
|
||||
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant_func);
|
||||
|
||||
// Out variant
|
||||
// TODO: Add {at::Tag::out_variant} tag and update all call sites
|
||||
// to use the functional variant once vLLM upgrades PyTorch.
|
||||
// See pytorch/pytorch#176117.
|
||||
ops.def(
|
||||
"scaled_fp4_quant.out(Tensor input,"
|
||||
" Tensor input_scale, bool "
|
||||
"is_sf_swizzled_layout, *, Tensor(a!) output, Tensor(b!) output_scale) "
|
||||
"-> ()");
|
||||
ops.impl("scaled_fp4_quant.out", torch::kCUDA, &scaled_fp4_quant_out);
|
||||
|
||||
// Compute NVFP4 experts quantization.
|
||||
ops.def(
|
||||
|
||||
@@ -586,7 +586,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
# From versions.json: .flashinfer.version
|
||||
ARG FLASHINFER_VERSION=0.6.4
|
||||
ARG FLASHINFER_VERSION=0.6.6
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
@@ -620,7 +620,7 @@ RUN set -eux; \
|
||||
ARG BITSANDBYTES_VERSION_X86=0.46.1
|
||||
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
|
||||
ARG TIMM_VERSION=">=1.0.17"
|
||||
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
|
||||
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.7"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
|
||||
@@ -628,7 +628,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope \
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||
|
||||
# ============================================================
|
||||
# VLLM INSTALLATION (depends on build stage)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user