Compare commits


16 Commits

Author SHA1 Message Date
roikoren755
95c0f928cd [NemotronH] Small fix reasoning parser (#36635)
Signed-off-by: Roi Koren <roik@nvidia.com>
(cherry picked from commit e661b9ee83)
2026-03-11 02:51:18 -07:00
Shaun Kotek
c9b1e977dc add nemotron v3 reasoning parser (#36393)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Co-authored-by: root <root@gpu-259.slurm-workers-slurm.slurm.svc.cluster.local>
(cherry picked from commit 203a7f27da)
2026-03-11 02:51:04 -07:00
Kevin H. Luu
1ff2393897 [ci] Bound nvidia-cudnn-frontend version (#36719)
Signed-off-by: khluu <khluu000@gmail.com>
(cherry picked from commit 82b110d50e)
2026-03-10 21:20:41 -07:00
Benjamin Chislett
5bec0b0ba3 [DSV3.2][MTP] Optimize Indexer MTP handling (#36723)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
(cherry picked from commit 9040cd40af)
2026-03-10 21:20:23 -07:00
Wei Zhao
6da1310f91 [Bug] Fix TRTLLM Block FP8 MoE Monolithic (#36296)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
(cherry picked from commit 84e436ed1c)
2026-03-10 19:08:18 -07:00
khluu
bc46be5daf Revert "add nemotron v3 reasoning parser (#36393)"
This reverts commit 8e39d39fd4.
2026-03-10 11:47:09 -07:00
Shaun Kotek
8e39d39fd4 add nemotron v3 reasoning parser (#36393)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Co-authored-by: root <root@gpu-259.slurm-workers-slurm.slurm.svc.cluster.local>
(cherry picked from commit 203a7f27da)
2026-03-10 09:50:38 -07:00
Vadim Gimpelson
46fa044cc1 [BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
(cherry picked from commit 4ff8c3c8f9)
2026-03-10 09:26:18 -07:00
amirkl94
ab43e37158 Fix: Re-Enable EP for trtllm MoE FP8 backend (#36494)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
(cherry picked from commit 156e33553c)
2026-03-10 09:26:03 -07:00
Shaun Kotek
f45d010120 Fix/resupport nongated fused moe triton (#36412)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: liweiguang <codingpunk@gmail.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Alex Brooks <albrooks@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
Signed-off-by: Tushar Shetty <tushar.shetty@abbyy.com>
Signed-off-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: nvnbagrov <nbagrov@nvidia.com>
Co-authored-by: Sage <80211083+sagearc@users.noreply.github.com>
Co-authored-by: danisereb <daserebrenik@nvidia.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Weiguang Li <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Alex Brooks <albrooks@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: cong-or <conchubhar.gannon@gmail.com>
Co-authored-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Co-authored-by: liuzhenwei <zhenwei.liu@intel.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit fa028207aa)
2026-03-10 09:25:51 -07:00
amitz-nv
244b922088 [Bugfix] Fix passing of activation_type to trtllm fused MoE NVFP4 and FP8 (#36017)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
(cherry picked from commit d7adcadb9b)
2026-03-10 09:25:36 -07:00
khluu
b31e9326a7 Bound openai to under 2.25.0
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-06 13:04:15 -08:00
Doug Smith
e346c08560 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
(cherry picked from commit 0bfa229bf1)
2026-03-06 13:03:53 -08:00
Avery Miao
b7a423cb01 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
(cherry picked from commit e998fa76b9)
2026-03-06 13:03:40 -08:00
Cyrus Leung
fa78ec8a72 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 7196348157)
2026-03-06 13:03:26 -08:00
Kunshang Ji
9a474ce7a4 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit a8f66cbde8)
2026-03-06 13:03:05 -08:00
1107 changed files with 44909 additions and 81578 deletions

View File

@@ -10,7 +10,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm

View File

@@ -21,20 +21,6 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true

View File

@@ -25,7 +25,9 @@ fi
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_X86=true \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \
--progress plain .

View File

@@ -13,10 +13,9 @@ import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08
@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
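
A quick sketch of how the two acceptance checks in the hunk above differ (illustrative numbers only; `ground_truth`, `measured_value`, and `rtol` follow the surrounding test, and `np.isclose` is applied per its documented |a-b| <= atol + rtol*|b| rule):

import numpy as np

ground_truth, rtol = 0.80, 0.08

for measured_value in (0.75, 0.90):
    # One-sided check: only penalizes scores below ground_truth * (1 - rtol).
    one_sided = measured_value >= ground_truth * (1 - rtol)
    # Symmetric check: also fails when the score overshoots by more than ~rtol.
    symmetric = np.isclose(ground_truth, measured_value, rtol=rtol)
    print(measured_value, one_sided, symmetric)

# 0.75 -> one_sided=True,  symmetric=True   (within 8% either way)
# 0.90 -> one_sided=True,  symmetric=False  (overshoot is rejected by isclose)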

View File

@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},

View File

@@ -7,12 +7,12 @@ import argparse
import html as _html
import json
import os
from contextlib import nullcontext
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -33,45 +33,6 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# -----------------------------
# Core data compare
# -----------------------------
@@ -91,25 +52,19 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
print("\ncompare_data_column:", data_column)
frames = []
raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +75,12 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
)
union_index = None
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
for c in (
@@ -163,61 +105,35 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
if data_column in df_idx.columns:
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
name_s = None
if not meta_added:
frames.append(meta)
meta_added = True
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
if union_index is None:
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
frames.append(s)
raw_data_cols.append(file_label)
metric_series_aligned.append(s_aligned)
compare_frames.append(s)
if debug and name_s is not None:
frames.append(name_s.reindex(union_index))
if len(metric_series_aligned) >= 2:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
ratio = base / current
else:
ratio = current / base
ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
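
A minimal sketch of the "Ratio 1 vs N" convention computed just above, assuming only two result files and made-up values; `metric_ratio` and the series names are hypothetical, but the inversion rule mirrors the diff: latency-like columns (P99/Median) use base/current so that a value above 1 always means the last file improved on the first.

import pandas as pd

def metric_ratio(base: pd.Series, current: pd.Series, data_column: str) -> pd.Series:
    # Latency-like metrics (P99 / Median): lower is better, so invert the
    # ratio; throughput-like metrics keep current / base.
    if "P99" in data_column or "Median" in data_column:
        return base / current
    return current / base

p99_file1 = pd.Series([100.0, 200.0])   # hypothetical P99 TTFT (ms), file 1
p99_file2 = pd.Series([80.0, 220.0])    # hypothetical P99 TTFT (ms), file 2
print(metric_ratio(p99_file1, p99_file2, "P99 TTFT (ms)"))  # 1.25, ~0.909
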
@@ -288,10 +204,24 @@ def split_json_by_tp_pp(
# -----------------------------
# Styling helpers
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, threshold: float
) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
@@ -304,24 +234,12 @@ def _highlight_threshold(
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -368,30 +286,11 @@ def _sanitize_sheet_name(name: str) -> str:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = " ".join(name.split())
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
@@ -399,57 +298,30 @@ def _sanitize_sheet_name(name: str) -> str:
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
model = d.get("Model", "model")
model_short = str(model).split("/")[-1]
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
startrow = 0
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
pd.DataFrame([[title]]).to_excel(
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
)
startrow += 1
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
@@ -556,11 +428,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok(
df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
@@ -573,14 +441,7 @@ def _max_concurrency_ok(
if d.empty:
return pd.NA
# Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
ok = d[d[cfg_col] <= threshold]
if ok.empty:
return pd.NA
@@ -646,25 +507,15 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
@@ -693,8 +544,8 @@ def build_valid_max_concurrency_summary_html(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -769,24 +620,15 @@ def build_valid_max_concurrency_summary_df(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
@@ -815,8 +657,8 @@ def build_valid_max_concurrency_summary_df(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -909,21 +751,7 @@ def build_parser() -> argparse.ArgumentParser:
help="Reference limit for TPOT plots (ms)",
)
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
@@ -1015,13 +843,9 @@ def render_metric_table_html(
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
styler = _highlight_threshold(display_group, args.ttft_max_ms)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
styler = _highlight_threshold(display_group, args.tpot_max_ms)
else:
styler = display_group.style
@@ -1138,46 +962,22 @@ def write_report_group_first(
csv_dir.mkdir(parents=True, exist_ok=True)
excel_path = args.excel_out or "perf_comparison.xlsx"
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if xw is not None:
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
@@ -1193,19 +993,12 @@ def write_report_group_first(
main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
@@ -1266,7 +1059,7 @@ def write_report_group_first(
)
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
@@ -1274,7 +1067,7 @@ def write_report_group_first(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
@@ -1304,13 +1097,9 @@ def write_report_group_first(
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")

View File

@@ -12,13 +12,6 @@ DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
@@ -190,304 +183,6 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
@@ -652,48 +347,10 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +382,14 @@ run_serving_tests() {
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve $server_model \
server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -778,14 +436,6 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
@@ -794,9 +444,8 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args_effective $client_remote_args "
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -818,11 +467,6 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
@@ -888,7 +532,6 @@ main() {
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}
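
A minimal Python sketch of the adaptive-concurrency refinement removed in this file's diff, assuming a `probe` callback that runs one benchmark and reports whether both TTFT and TPOT SLAs pass (the function name and example numbers are hypothetical; the bisection between the last passing and first failing static point, bounded by a probe budget, mirrors `adaptive_refine_from_static_results` above):

from typing import Callable

def refine_max_concurrency(last_pass: int, first_fail: int,
                           probe: Callable[[int], bool],
                           max_probes: int = 8) -> tuple[int, int]:
    """Bisect between the largest passing and smallest failing concurrency."""
    low, high, probes = last_pass, first_fail, 0
    while low + 1 < high and probes < max_probes:
        mid = (low + high) // 2
        probes += 1
        if probe(mid):
            low = mid
        else:
            high = mid
    return low, high   # final_last_pass, final_first_fail

# Hypothetical SLA: concurrency up to 44 passes, anything higher fails.
print(refine_max_concurrency(32, 64, lambda c: c <= 44))   # (44, 45)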

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -149,39 +149,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
@@ -221,45 +188,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {

View File

@@ -72,6 +72,17 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
@@ -94,6 +105,17 @@
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
@@ -117,25 +139,14 @@
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
"random-output-len": 128
}
}
]

View File

@@ -10,6 +10,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -88,6 +91,7 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,

View File

@@ -5,6 +5,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -22,6 +23,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -39,6 +41,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -56,6 +59,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,

View File

@@ -83,7 +83,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:

View File

@@ -166,19 +166,12 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
# Notify Slack if webhook is configured.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
@@ -201,11 +194,10 @@ data = {
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi

View File

@@ -205,13 +205,6 @@ re_quote_pytest_markers() {
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
@@ -249,11 +242,6 @@ re_quote_pytest_markers() {
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
@@ -504,8 +492,6 @@ else
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e BUILDKITE_PARALLEL_JOB \
-e BUILDKITE_PARALLEL_JOB_COUNT \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log
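
A hedged Python equivalent of the `wait_for_pid_and_check_log` pattern in the removed script above: launch a command with its output redirected to a log, wait for it, and surface the last lines of the log on failure. `run_and_check` is a hypothetical helper; the commented SDE invocation mirrors one of the probes above and assumes the `./sde` tree exists.

import subprocess

def run_and_check(cmd: list[str], log_file: str) -> bool:
    """Run one command; on a non-zero exit, print the tail of its log."""
    with open(log_file, "w") as log:
        proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
    rc = proc.wait()
    if rc != 0:
        print(f"Process exited with status {rc}; last lines of {log_file}:")
        with open(log_file) as log:
            print("".join(log.readlines()[-50:]))
        return False
    return True

# Illustrative use, mirroring one of the SDE probes above (requires the SDE tree):
# run_and_check(["./sde/sde64", "-skl", "--", "python3",
#                "examples/basic/offline_inference/generate.py",
#                "--model", "facebook/opt-125m"], "test_0.log")
ok = run_and_check(["python3", "-c", "print('hello')"], "test_0.log")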

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
'
EXITCODE=$?

View File

@@ -34,15 +34,15 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine

View File

@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else
# Non-ROCm platform (CUDA/other)

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
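# Point the OpenAI-compatible client at the local vLLM server and root BFCL's output under output_dir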
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
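# Collect a callable's keyword defaults from its signature, unwrapping Typer
# OptionInfo wrappers, so the Typer commands generate()/evaluate() can be
# invoked as plain Python functions.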
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

View File

@@ -72,7 +72,7 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels
# call script to generate indicies for all existing wheels
# these indices have relative paths that work as long as they are next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/

File diff suppressed because it is too large

View File

@@ -14,3 +14,8 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

View File

@@ -36,16 +36,6 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
@@ -101,8 +91,8 @@ steps:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
@@ -132,9 +122,9 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
@@ -150,8 +140,8 @@ steps:
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
@@ -205,7 +195,7 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"

View File

@@ -50,18 +50,23 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,6 +84,19 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
@@ -88,47 +106,6 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
device: h100
@@ -169,7 +146,7 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -233,19 +210,6 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"

View File

@@ -1,5 +1,5 @@
group: Engine
depends_on:
depends_on:
- image-build
steps:
- label: Engine
@@ -14,30 +14,28 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: Engine (1 GPU)
timeout_in_minutes: 30
- label: V1 e2e + engine (1 GPU)
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/engine/
- tests/v1/engine/
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match on H100, whether or not
# VLLM_USE_FLASHINFER_SAMPLER is set.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -48,7 +46,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
@@ -64,7 +62,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4

View File

@@ -24,6 +24,11 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -34,13 +39,8 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
@@ -65,6 +65,11 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50
@@ -82,11 +87,6 @@ steps:
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness
timeout_in_minutes: 30

View File

@@ -8,9 +8,8 @@ steps:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
@@ -97,7 +96,7 @@ steps:
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py

View File

@@ -67,13 +67,12 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 basic/offline_inference/chat.py
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0

View File

@@ -1,110 +0,0 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"

View File

@@ -65,7 +65,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -2,65 +2,16 @@ group: Models - Multimodal
depends_on:
- image-build
steps:
- label: "Multi-Modal Models (Standard) 1: qwen2"
timeout_in_minutes: 45
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU)
depends_on:
@@ -103,11 +54,6 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2
optional: true

View File

@@ -15,12 +15,9 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin

View File

@@ -1,40 +0,0 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"

.github/mergify.yml
View File

@@ -3,7 +3,6 @@ pull_request_rules:
description: Automatically apply documentation label
conditions:
- label != stale
- -closed
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash
uv pip install pre-commit>=4.5.1
uv pip install pre-commit
pre-commit install
pre-commit run --all-files
```
@@ -38,13 +37,15 @@ pull_request_rules:
> [!TIP]
> <details>
> <summary>Is <code>mypy</code> failing?</summary>
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/>
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
>
> ```bash
> # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ```
> </details>
@@ -334,7 +335,7 @@ pull_request_rules:
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*

View File

@@ -6,9 +6,6 @@ on:
- main
workflow_dispatch: # Manual trigger
permissions:
contents: read
jobs:
macos-m1-smoke-test:
runs-on: macos-latest

.gitignore
View File

@@ -189,9 +189,11 @@ cython_debug/
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# Cursor

View File

@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: v1.43.5
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
@@ -24,13 +24,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/DavidAnson/markdownlint-cli2
rev: v0.21.0
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
hooks:
- id: markdownlint-cli2
language_version: lts
args: [--fix]
exclude: ^CLAUDE\.md$
- id: markdownlint
exclude: '.*\.inc\.md'
stages: [manual] # Only run in CI
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
@@ -56,7 +55,7 @@ repos:
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -128,13 +127,6 @@ repos:
language: python
types: [python]
additional_dependencies: [regex]
# prevent use torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' APIs call"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py

View File

@@ -9,7 +9,6 @@ build:
python: "3.12"
jobs:
post_checkout:
# - bash docs/maybe_skip_pr_build.sh
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv

AGENTS.md
View File

@@ -1,113 +0,0 @@
# Agent Instructions for vLLM
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
> Breaching these guidelines can result in automatic banning.
## 1. Contribution Policy (Mandatory)
### Duplicate-work checks
Before proposing a PR, run these checks:
```bash
gh issue view <issue_number> --repo vllm-project/vllm --comments
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
```
- If an open PR already addresses the same fix, do not open another.
- If your approach is materially different, explain the difference in the issue.
### No low-value busywork PRs
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
### Accountability
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
- The submitting human must review every changed line and run relevant tests.
- PR descriptions for AI-assisted work **must** include:
- Why this is not duplicating an existing PR.
- Test commands run and results.
- Clear statement that AI assistance was used.
### Fail-closed behavior
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
---
## 2. Development Workflow
### Environment setup
```bash
# Install `uv` if you don't have it already:
curl -LsSf https://astral.sh/uv/install.sh | sh
# Always use `uv` for Python environment management:
uv venv --python 3.12
source .venv/bin/activate
# Always make sure `pre-commit` and its hooks are installed:
uv pip install -r requirements/lint.txt
pre-commit install
```
### Installing dependencies
```bash
# If you are only making Python changes:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
# If you are also making C/C++ changes:
uv pip install -e .
```
### Running tests
Tests require extra dependencies.
All versions for test dependencies should be read from `requirements/test.txt`
```bash
# Install bare minimum test dependencies:
uv pip install pytest pytest-asyncio tblib
# Install additional test dependencies as needed, or install them all as follows:
uv pip install -r requirements/test.txt
# Run specific test from specific test file
pytest tests/path/to/test.py -v -s -k test_name
# Run all tests in directory
pytest tests/path/to/dir -v -s
```
### Running linters
```bash
# Run all pre-commit hooks on staged files:
pre-commit run
# Run on all files:
pre-commit run --all-files
# Run a specific hook:
pre-commit run ruff-check --all-files
# Run mypy as it is in CI:
pre-commit run mypy-3.10 --all-files --hook-stage manual
```
### Commit messages
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```

View File

@@ -1 +0,0 @@
@AGENTS.md

View File

@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.

View File

@@ -187,7 +187,7 @@ python benchmark.py \
## Hardware Requirements
| Backend | Hardware |
| ------- | -------- |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |

View File

@@ -59,9 +59,7 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
)
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -442,21 +440,14 @@ def main():
# Backend selection
parser.add_argument(
"--backends",
"--decode-backends",
nargs="+",
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
parser.add_argument(
"--prefill-backends",
nargs="+",
help="Prefill backends to compare (fa2, fa3, fa4). "
"Uses the first decode backend for impl construction.",
)
# Batch specifications
parser.add_argument(
@@ -511,7 +502,7 @@ def main():
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backend is not None or args.backends is not None
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
@@ -521,12 +512,6 @@ def main():
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
elif "decode_backends" in yaml_config:
args.backends = yaml_config["decode_backends"]
args.backend = None
# Prefill backends (e.g., ["fa3", "fa4"])
args.prefill_backends = yaml_config.get("prefill_backends", None)
# Check for special modes
if "mode" in yaml_config:
@@ -628,10 +613,7 @@ def main():
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
@@ -868,93 +850,37 @@ def main():
else:
# Normal mode: compare backends
decode_results = []
prefill_results = []
total = len(backends) * len(args.batch_specs)
# Run decode backend comparison
if not prefill_backends:
# No prefill backends specified: compare decode backends as before
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
result = run_benchmark(config)
decode_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
if not result.success:
console.print(
f"[red]Error {backend} {spec}: {result.error}[/]"
)
pbar.update(1)
pbar.update(1)
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(decode_results, backends)
# Run prefill backend comparison
if prefill_backends:
# Use first decode backend for impl construction
decode_backend = backends[0]
total = len(prefill_backends) * len(args.batch_specs)
console.print(
f"[yellow]Prefill comparison mode: "
f"using {decode_backend} for decode impl[/]"
)
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
for spec in args.batch_specs:
for pb in prefill_backends:
config = BenchmarkConfig(
backend=decode_backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
prefill_backend=pb,
)
result = run_benchmark(config)
# Label result with prefill backend name for display
labeled_config = replace(result.config, backend=pb)
result = replace(result, config=labeled_config)
prefill_results.append(result)
if not result.success:
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
pbar.update(1)
console.print("\n[bold green]Prefill Backend Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(
prefill_results, prefill_backends, compare_to_fastest=True
)
all_results = decode_results + prefill_results
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:

View File

@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparsable specs
# Fallback for unparseable specs
return (0, 0, 0)
@@ -77,7 +77,6 @@ class MockKVBProj:
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
self.weight = torch.empty(0, dtype=torch.bfloat16)
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
@@ -214,7 +213,6 @@ class BenchmarkConfig:
use_cuda_graphs: bool = False
# MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None

View File

@@ -1,19 +1,4 @@
# MLA prefill backend comparison
#
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
#
# Backends that aren't available on the current platform will report errors
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
#
# Usage:
# python benchmark.py --config configs/mla_prefill.yaml
description: "MLA prefill backend comparison"
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
@@ -27,25 +12,20 @@ model:
v_head_dim: 128
block_size: 128
# model:
# name: "deepseek-v2-lite"
# num_layers: 27
# num_q_heads: 16
# num_kv_heads: 1
# head_dim: 576
# kv_lora_rank: 512
# qk_nope_head_dim: 128
# qk_rope_head_dim: 64
# v_head_dim: 128
# block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "q512"
- "q1k"
- "q2k"
- "q4k"
- "q8k"
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
@@ -64,63 +44,19 @@ batch_specs:
- "8q4k"
- "8q8k"
# Chunked prefill / extend
# Short context
- "q128s1k"
- "q256s2k"
- "q512s4k"
- "q1ks4k"
- "q2ks8k"
- "2q128s1k"
- "2q256s2k"
- "2q512s4k"
- "2q1ks4k"
- "2q2ks8k"
- "4q128s1k"
- "4q256s2k"
- "4q512s4k"
- "4q1ks4k"
- "4q2ks8k"
- "8q128s1k"
- "8q256s2k"
- "8q512s4k"
- "8q1ks4k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
# Medium context
- "q128s16k"
- "q512s16k"
- "q1ks16k"
- "q2ks16k"
- "2q128s16k"
- "2q512s16k"
- "2q1ks16k"
- "2q2ks16k"
- "4q128s16k"
- "4q512s16k"
- "4q1ks16k"
- "4q2ks16k"
# Long context
- "q128s64k"
- "q512s64k"
- "q1ks64k"
- "q2ks64k"
- "2q128s64k"
- "2q512s64k"
- "2q1ks64k"
- "2q2ks64k"
decode_backends:
- CUTLASS_MLA
prefill_backends:
- fa2
- fa3
- fa4
- flashinfer
- cudnn
- trtllm
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 20
warmup_iters: 5
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -62,7 +62,6 @@ def create_minimal_vllm_config(
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -76,9 +75,6 @@ def create_minimal_vllm_config(
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
"cudnn", "trtllm"). Configures the attention config to
force the specified prefill backend.
Returns:
VllmConfig for benchmarking
@@ -149,6 +145,7 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
@@ -167,7 +164,7 @@ def create_minimal_vllm_config(
compilation_config = CompilationConfig()
vllm_config = VllmConfig(
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
@@ -175,84 +172,9 @@ def create_minimal_vllm_config(
compilation_config=compilation_config,
)
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
"disable_flashinfer_prefill"
]
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
"use_cudnn_prefill"
]
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
"use_trtllm_ragged_deepseek_prefill"
]
return vllm_config
# ============================================================================
# Prefill Backend Configuration
# ============================================================================
# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths.
# Non-FA backends enable their specific path and disable others.
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
"fa2": {
"flash_attn_version": 2,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa3": {
"flash_attn_version": 3,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa4": {
"flash_attn_version": 4,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"flashinfer": {
"flash_attn_version": None,
"disable_flashinfer_prefill": False,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"cudnn": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": True,
"use_trtllm_ragged_deepseek_prefill": False,
},
"trtllm": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": True,
},
}
def get_prefill_backend_config(prefill_backend: str) -> dict:
"""Get attention config overrides for a prefill backend."""
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
raise ValueError(
f"Unknown prefill backend: {prefill_backend!r}. "
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
)
return _PREFILL_BACKEND_CONFIG[prefill_backend]
# ============================================================================
# Decode Backend Configuration
# Backend Configuration
# ============================================================================
@@ -282,7 +204,6 @@ def _get_backend_config(backend: str) -> dict:
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backend import MultipleOf
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
@@ -299,8 +220,8 @@ def _get_backend_config(backend: str) -> dict:
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
# No fixed block size; fall back to config value
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
@@ -756,11 +677,16 @@ def _run_single_benchmark(
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl.forward_mha(
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
@@ -775,7 +701,7 @@ def _run_single_benchmark(
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
times = []
@@ -788,7 +714,7 @@ def _run_single_benchmark(
forward_fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
@@ -807,7 +733,6 @@ def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
@@ -819,13 +744,11 @@ def _run_mla_benchmark_batched(
to avoid setup/teardown overhead.
Args:
backend: Backend name (decode backend used for impl construction)
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
List of BenchmarkResult objects
@@ -835,7 +758,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
@@ -858,25 +781,11 @@ def _run_mla_benchmark_batched(
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
)
results = []
with set_current_vllm_config(vllm_config):
# Clear cached prefill backend detection functions so they re-evaluate
# with the current VllmConfig. These are @functools.cache decorated and
# would otherwise return stale results from a previous backend's config.
from vllm.model_executor.layers.attention.mla_attention import (
use_cudnn_prefill,
use_flashinfer_prefill,
use_trtllm_ragged_deepseek_prefill,
)
use_flashinfer_prefill.cache_clear()
use_cudnn_prefill.cache_clear()
use_trtllm_ragged_deepseek_prefill.cache_clear()
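The cache_clear() calls above exist because `@functools.cache` memoizes on arguments alone, so a zero-argument detection helper latches onto whatever it computed under the first VllmConfig it saw. A self-contained sketch of that failure mode; the module-level flag and the helper name here are illustrative, not the real vLLM functions:

```python
import functools

_active_backend = "flashinfer"   # stands in for "the current VllmConfig"


@functools.cache
def detect_flashinfer_prefill() -> bool:
    # Cached on an empty argument list: evaluated once, then frozen.
    return _active_backend == "flashinfer"


print(detect_flashinfer_prefill())   # True

_active_backend = "cudnn"
print(detect_flashinfer_prefill())   # still True -- stale cached answer

detect_flashinfer_prefill.cache_clear()
print(detect_flashinfer_prefill())   # False -- re-evaluated under the new config
```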
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
@@ -886,38 +795,6 @@ def _run_mla_benchmark_batched(
index_topk=index_topk if is_sparse else None,
)
# Verify the actual prefill backend matches what was requested
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
fa_version = prefill_cfg["flash_attn_version"]
if fa_version is not None:
# FA backend: verify the impl's FA version
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
f"version {fa_version}, but the impl is using FA "
f"version {actual_fa_version}. Check "
f"vllm/v1/attention/backends/fa_utils.py."
)
else:
# Non-FA backend: verify the builder picked the right path
expected_flags = {
"flashinfer": "_use_fi_prefill",
"cudnn": "_use_cudnn_prefill",
"trtllm": "_use_trtllm_ragged_prefill",
}
flag_name = expected_flags.get(prefill_backend)
if flag_name and not getattr(builder_instance, flag_name, False):
raise RuntimeError(
f"Prefill backend '{prefill_backend}' was requested "
f"but the metadata builder did not enable it. This "
f"usually means a dependency is missing (e.g., "
f"flashinfer not installed) or the platform doesn't "
f"support it."
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -968,7 +845,6 @@ def run_mla_benchmark(
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
@@ -986,8 +862,6 @@ def run_mla_benchmark(
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
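For orientation, a sketch of the shapes these two entry points traffic in. Only the (config, threshold, num_splits) tuple layout and the single-versus-list return behaviour come from the docstrings above; `make_config` is a hypothetical stand-in for building the benchmark config objects used elsewhere in this file:

```python
def make_config(batch_size):
    """Hypothetical helper; builds one benchmark config for a given batch size."""
    ...


# What the batched runner consumes: one tuple per benchmark, all sharing a
# single backend impl so setup/teardown happens only once.
configs_with_params = [
    # (config, reorder_batch_threshold, num_kv_splits)
    (make_config(1), 1, None),       # FlashAttn/FlashMLA-style: threshold set
    (make_config(32), None, 4),      # CUTLASS-style: num_kv_splits set
    (make_config(128), None, None),  # neither knob applies
]

# run_mla_benchmark mirrors this: a single config yields one BenchmarkResult,
# a list of configs yields a list of BenchmarkResult objects in the same order.
```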
@@ -1011,9 +885,7 @@ def run_mla_benchmark(
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
)
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input
return results[0] if return_single else results

View File

@@ -141,6 +141,7 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
@@ -390,7 +391,7 @@ def _run_single_benchmark(
attn_metadata,
output=out,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
times = []
@@ -411,15 +412,15 @@ def _run_single_benchmark(
)
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
"reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
@@ -443,7 +444,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)

View File

@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |

View File

@@ -94,18 +94,15 @@ def create_logits(
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.accelerator.synchronize()
return (
torch.accelerator.memory_allocated(),
torch.accelerator.max_memory_allocated(),
)
torch.cuda.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.accelerator.reset_peak_memory_stats()
torch.accelerator.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
@@ -126,7 +123,7 @@ def benchmark_function(
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
@@ -143,7 +140,7 @@ def benchmark_function(
func(logits_copy, k, p)
end_events[i].record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Calculate timing
times = [

View File

@@ -1,98 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 dimensions
NOPE_DIM = 512
ROPE_DIM = 64
NUM_HEADS = 128
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
def get_configs():
return NUM_TOKENS
def make_inputs(num_tokens, dtype):
"""Create inputs matching the real code path.
Args:
contiguous_nope: If False, simulate the transposed BMM output
(non-contiguous nope with stride pattern from
[N,B,L].transpose(0,1)).
"""
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
ql_nope = raw.transpose(0, 1)
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
return ql_nope, q_pe
# ---- Non-contiguous nope benchmark (real code path) ----
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens"],
x_vals=get_configs(),
line_arg="provider",
line_vals=["torch_cat", "concat_mla_q"],
line_names=["torch.cat", "concat_mla_q (v8)"],
styles=[("blue", "--"), ("green", "-")],
ylabel="Latency (us)",
plot_name="concat_mla_q-transposed",
args={},
)
)
def bench_transposed(num_tokens, provider):
dtype = torch.bfloat16
ql_nope, q_pe = make_inputs(num_tokens, dtype)
q_out = torch.empty(
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch_cat":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
)
else:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
parser.add_argument(
"--save-path", type=str, default=None, help="Path to save benchmark results"
)
args = parser.parse_args()
print("\n" + "=" * 70)
print("CONCAT MLA Q KERNEL BENCHMARKS")
print("=" * 70)
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
print(
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
)
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
print("=" * 70)
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
bench_transposed.run(print_data=True, save_path=args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -1,153 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import math
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 MLA dimensions
NOPE_DIM = 512
ROPE_DIM = 64
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
# Realistic prefill scenarios:
# - 1 long prefill: single request, 16K-96K tokens
# - 4 medium prefills: 4 requests, 4K-24K tokens each
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
SCENARIOS = [
# (label, num_reqs, total_tokens_list)
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
]
def make_inputs(total_tokens, num_reqs, block_size):
"""Create synthetic FP8 cache, block table, and output buffer.
Fills the cache with random bytes (we only measure throughput,
not correctness). Block table maps each request to contiguous
physical blocks.
"""
# Divide tokens evenly across requests
base_len = total_tokens // num_reqs
remainder = total_tokens % num_reqs
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
# workspace_starts: cumulative sum of seq_lens
workspace_starts = [0] * num_reqs
for r in range(1, num_reqs):
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
# Physical blocks needed per request
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
total_blocks = sum(blocks_per_req)
max_blocks = max(blocks_per_req)
# Allocate cache with random data (content doesn't matter for perf)
cache = torch.randint(
0,
256,
(total_blocks, block_size, ENTRY_BYTES),
dtype=torch.uint8,
device="cuda",
)
# Block table: contiguous block assignments
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
block_idx = 0
for r in range(num_reqs):
for b in range(blocks_per_req[r]):
block_table[r, b] = block_idx
block_idx += 1
# Output workspace
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
workspace_starts_t = torch.tensor(
workspace_starts, dtype=torch.int32, device="cuda"
)
return cache, dst, block_table, seq_lens_t, workspace_starts_t
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["total_tokens"],
x_vals=total_tokens_list,
line_arg="provider",
line_vals=["cuda_kernel"],
line_names=["cp_gather_fp8 (CUDA)"],
styles=[("green", "-")],
ylabel="Latency (us)",
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
args={"num_reqs": num_reqs},
)
)
def bench_fn(total_tokens, provider, num_reqs):
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
total_tokens, num_reqs, BLOCK_SIZE
)
quantiles = [0.5, 0.2, 0.8]
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
),
quantiles=quantiles,
rep=500,
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
seq_len_per_req = total_tokens_list[0] // num_reqs
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
print(
f"\n--- {label}: {num_reqs} request(s), "
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
)
bench_fn.run(print_data=True, save_path=save_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
)
parser.add_argument(
"--save-path",
type=str,
default=None,
help="Path to save benchmark results as CSV",
)
args = parser.parse_args()
# Print data volume info for bandwidth analysis
read_per_token = ENTRY_BYTES # 656 bytes from cache
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
total_per_token = read_per_token + write_per_token # 1808 bytes
print("\n" + "=" * 70)
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
print("=" * 70)
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
print(f"Per token: {total_per_token} bytes (read + write)")
print(f"Block size: {BLOCK_SIZE} tokens/block")
print("=" * 70)
for label, num_reqs, total_tokens_list in SCENARIOS:
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -168,7 +168,7 @@ def bench_impl(
# warmup
for kwargs in kwargs_list:
impl_type.get_impl()(**kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
# reference output
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
# test output
# test ouptut
out_q, out_s = output_from_impl(
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
)

View File

@@ -64,7 +64,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.accelerator.current_device_index())
init_workspace_manager(torch.cuda.current_device())
(m, k, n) = mkn
dtype = torch.half
@@ -171,7 +171,7 @@ def bench_run(
activation=MoEActivation.SILU,
global_num_experts=num_experts,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
@@ -187,14 +187,14 @@ def bench_run(
topk_ids,
quant_config=quant_config,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Timing
start_event = torch.Event(enable_timing=True)
@@ -202,7 +202,7 @@ def bench_run(
latencies = []
for _ in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -307,7 +307,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -330,7 +330,7 @@ def bench_run(
e=num_experts,
device=device,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -345,7 +345,7 @@ def bench_run(
w2_fp8scale,
a_fp8_scale,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
if not should_use_fn(tensor):
return None
torch.accelerator.synchronize()
torch.cuda.synchronize()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
for _ in range(num_warmup):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.perf_counter()
for _ in range(num_trials):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
@@ -495,7 +495,7 @@ def main():
# Set device
device = torch.device(f"cuda:{rank}")
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
# Get CPU process group
cpu_group = dist.new_group(backend="gloo")

View File

@@ -385,32 +385,32 @@ def benchmark_operation(
# Warmup before graph capture
for _ in range(warmup):
operation_func(*args, **kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Create CUDA graph
graph = torch.cuda.CUDAGraph()
num_op_per_cudagraph = 10
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
device = torch.device(f"cuda:{torch.cuda.current_device()}")
with graph_capture(device=device), torch.cuda.graph(graph):
for _ in range(num_op_per_cudagraph):
operation_func(*args, **kwargs)
# Graph warmup
torch.accelerator.synchronize()
torch.cuda.synchronize()
for _ in range(warmup):
graph.replay()
# Benchmark with CUDA graph
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.perf_counter()
for _ in range(trials // num_op_per_cudagraph):
# operation_func(*args, **kwargs)
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
avg_time_ms = ((end_time - start_time) / trials) * 1000
@@ -984,7 +984,7 @@ def main():
world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}")
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
torch.set_default_device(device)
init_distributed_environment()

View File

@@ -50,7 +50,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.accelerator.current_device_index())
init_workspace_manager(torch.cuda.current_device())
label = "Quant Matmul"
sub_label = (
@@ -224,7 +224,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -239,7 +239,7 @@ def bench_run(
topk_weights,
topk_ids,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -254,7 +254,7 @@ def bench_run(
w2_scale,
a_scale,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -34,14 +34,14 @@ def main(
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -1035,7 +1035,7 @@ def bench_optype(
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}

View File

@@ -47,13 +47,13 @@ def benchmark_method(
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms

View File

@@ -54,7 +54,7 @@ def clear_triton_cache():
# Clear CUDA memory cache
if torch.cuda.is_available():
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
# Try to clear Triton's runtime cache
try:
@@ -304,19 +304,19 @@ def benchmark_config(
# JIT compilation & warmup
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -324,7 +324,7 @@ def benchmark_config(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
@@ -626,11 +626,7 @@ class BenchmarkWorker:
if visible_device != f"{self.device_id}":
need_device_guard = True
with (
torch.accelerator.device_index(self.device_id)
if need_device_guard
else nullcontext()
):
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
for idx, config in enumerate(tqdm(search_space)):
try:
kernel_time = benchmark_config(

View File

@@ -131,7 +131,7 @@ def benchmark_config(
topk_ids,
quant_config=quant_config,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
quant_config=quant_config,
)
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
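Every kernel timing loop in these scripts uses the same CUDA-event pattern; a minimal standalone version (plain PyTorch, any CUDA device):

```python
import torch

assert torch.cuda.is_available()
x = torch.randn(4096, 4096, device="cuda")

# Warmup / JIT so the first timed iteration is not an outlier.
for _ in range(5):
    x @ x
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

num_iters = 20
start.record()
for _ in range(num_iters):
    x @ x
end.record()
torch.cuda.synchronize()  # make sure the end event has actually been reached

# elapsed_time() is in milliseconds; per-iteration microseconds as in the script above.
print(f"{start.elapsed_time(end) / num_iters * 1000:.1f} us/iter")
```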

View File

@@ -69,19 +69,19 @@ def benchmark_permute(
# JIT compilation & warmup
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
# JIT compilation & warmup
input = prepare()
run(input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run(input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -135,14 +135,14 @@ def benchmark_mrope(
key.clone(),
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
key_clone,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics

View File

@@ -103,7 +103,7 @@ def main(
max_logits = torch.empty_like(exp_sums)
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
)
else:
raise ValueError(f"Invalid version: {version}")
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -28,7 +28,7 @@ def _time_cuda(
# warmup
for _ in range(warmup_iters):
fn()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
for _ in range(bench_iters):
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
return start.elapsed_time(end) / bench_iters # ms/iter

View File

@@ -29,7 +29,7 @@ def main(
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -84,16 +84,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -104,7 +104,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
return lat

View File

@@ -109,16 +109,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -129,7 +129,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
return lat

View File

@@ -251,7 +251,7 @@ def benchmark(
kernel(
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
# Benchmark
latencies: list[float] = []
for _ in range(runs):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
for i in range(iterations_per_run):

View File

@@ -126,7 +126,7 @@ def benchmark_decode(
)
def time_fn(fn, warmup=10, trials=20):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
start.record()
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -138,7 +138,7 @@ def benchmark_prefill(
)
def time_fn(fn, warmup=10, trials=20):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
start.record()
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -177,18 +177,18 @@ def benchmark_config(
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
run()
end_event.record()
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.accelerator.set_device_index(gpu_id)
torch.cuda.set_device(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
def main(args):
print(args)
num_gpus = torch.accelerator.device_count()
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")

View File

@@ -35,7 +35,7 @@ def benchmark_shape(
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.accelerator.synchronize()
torch.cuda.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
# Warmup
for _ in range(warmup):
func()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Timing loop
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.time()
# Calculate timing and TFLOPS

View File

@@ -79,8 +79,7 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_ARM_BF16)
@@ -102,13 +101,11 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
"-mavx512f"
"-mavx512vl"
"-mavx512bw"
"-mavx512dq")
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
${CXX_COMPILE_FLAGS_AVX512}
"-mamx-bf16"
"-mamx-tile"
"-mavx512dq"
"-mavx512bf16"
"-mavx512vnni")
"-mavx512vnni"
"-mamx-bf16"
"-mamx-tile")
list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
@@ -145,19 +142,11 @@ elseif (S390_FOUND)
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
message(STATUS "RISC-V detected")
if(RVV_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
add_compile_definitions(RISCV_BF16_SUPPORT)
elseif (RVV_FP16_FOUND)
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
if(RVV_FOUND)
message(FAIL_ERROR "Can't support rvv now.")
else()
message(STATUS "compile riscv with scalar")
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
@@ -253,24 +242,13 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
)
else()
message(STATUS "Downloading oneDNN from GitHub")
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
GIT_PROGRESS TRUE
GIT_SHALLOW FALSE
)
else()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -316,8 +294,7 @@ endif()
# TODO: Refactor this
if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -369,15 +346,13 @@ if(USE_ONEDNN)
endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_SGL
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
@@ -403,48 +378,31 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
set(_C_AVX2_LIBS numa)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
LIBRARIES ${_C_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
USE_SABI 3
WITH_SOABI
)
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
# AVX512F
define_extension_target(
_C_AVX512
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${_C_AVX512_LIBS}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# AVX2
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
define_extension_target(
_C_AVX2
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${_C_AVX2_LIBS}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3
WITH_SOABI

View File

@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -74,12 +74,6 @@ void indexer_k_quant_and_cache(
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Concatenate query nope and rope for MLA/DSA attention
void concat_mla_q(
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]

View File

@@ -8,7 +8,6 @@
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#include "concat_mla_q.cuh"
#ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
@@ -919,8 +918,8 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
@@ -931,12 +930,6 @@ __global__ void gather_and_maybe_dequant_cache(
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
@@ -966,10 +959,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(
head_dim == 320 || head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
"for better performance")
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@@ -994,13 +986,7 @@ void gather_and_maybe_dequant_cache(
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
if (head_dim == 576) {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_576);
} else {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_320);
}
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
}
namespace vllm {
@@ -1009,67 +995,75 @@ namespace vllm {
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
const int32_t* __restrict__ workspace_starts, // [num_reqs]
const int32_t num_reqs, const int32_t block_size,
const int32_t total_tokens, const int64_t block_table_stride,
const int64_t cache_block_stride, const int64_t cache_entry_stride,
const int64_t dst_entry_stride) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= total_tokens) return;
const int lane_id = threadIdx.x & 31;
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ seq_lens, // [BATCH]
const int32_t* __restrict__ workspace_starts, // [BATCH]
const int32_t block_size, const int32_t head_dim,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = workspace_starts[bid];
const int32_t seq_len = seq_lens[bid];
const int32_t tot_slots = seq_len;
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
// Binary search to find which request owns this output token
int lo = 0, hi = num_reqs - 1;
while (lo < hi) {
int mid = (lo + hi + 1) >> 1;
if (workspace_starts[mid] <= flat_warp_id)
lo = mid;
else
hi = mid - 1;
const int32_t split_start = split * split_slots;
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
const bool is_active_split = (split_start < tot_slots);
if (!is_active_split) return;
// Adjust the pointer for the block_table for this batch
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = split_start;
int32_t offset_div = offset / block_size;
offset = offset % block_size;
const int32_t* batch_block_table = block_table + batch_offset;
// Adjust dst pointer based on the cumulative sequence lengths
dst += seq_start * dst_entry_stride;
const int tid = threadIdx.x;
// Process each token in this split
for (int pid = split_start; pid < split_end; ++pid) {
auto block_id = batch_block_table[offset_div];
const uint8_t* token_ptr =
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
const uint8_t* no_pe_ptr = token_ptr;
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const __nv_bfloat16* rope_ptr =
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
if (tid < 512) {
// FP8 dequantization
const int tile = tid >> 7; // each tile is 128 elements
const float scale = scales_ptr[tile];
const uint8_t val = no_pe_ptr[tid];
dst_ptr[tid] =
fp8::scaled_convert<__nv_bfloat16, uint8_t,
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
} else if (tid < 576) {
// Rope copy (64 bf16 elements)
const int rope_idx = tid - 512;
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
}
// Move to next token
offset += 1;
if (offset == block_size) {
offset_div += 1;
offset = 0;
}
}
const int req_id = lo;
// Compute physical token address via block table
const int out_token_id = flat_warp_id;
const int token_offset = out_token_id - workspace_starts[req_id];
const int cache_block_idx = token_offset / block_size;
const int offset_in_block = token_offset % block_size;
const int physical_block =
block_table[req_id * block_table_stride + cache_block_idx];
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
offset_in_block * cache_entry_stride;
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
const int4 fp8_data = nope_src[lane_id];
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const float scale = scales_ptr[lane_id >> 3];
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
#ifdef USE_ROCM
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
#else
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
#endif
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
rope_dst[lane_id] = rope_src[lane_id];
}
template <typename scalar_t>
@@ -1263,16 +1257,15 @@ void cp_gather_and_upconvert_fp8_kv_cache(
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
}
const int total_tokens = dst.size(0);
constexpr int warps_per_block = 8;
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
const int block_size_threads = warps_per_block * 32; // 256 threads
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
stream>>>(
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
static_cast<int32_t>(batch_size), block_size, total_tokens,
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table_stride, cache_block_stride, cache_entry_stride,
dst_entry_stride);
}
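Both versions of the kernel above decode the same 656-byte cache entry: 512 FP8 "nope" values quantized in four 128-element tiles (one float32 scale each), followed by 64 BF16 RoPE values, written out as a 576-element BF16 row. A pure-PyTorch reference for a single entry, plus the request lookup that the warp-per-token variant performs with a binary search; this is a readability sketch (recent PyTorch with float8 support assumed), not a description of the kernel's launch configuration:

```python
import bisect

import torch


def upconvert_entry(entry: torch.Tensor) -> torch.Tensor:
    """entry: uint8 tensor of shape [656] -> bfloat16 tensor of shape [576]."""
    assert entry.dtype == torch.uint8 and entry.numel() == 656
    fp8 = entry[:512].view(torch.float8_e4m3fn).to(torch.float32)  # 512 nope values
    scales = entry[512:528].view(torch.float32)                    # 4 per-tile scales
    rope = entry[528:].view(torch.bfloat16)                        # 64 rope values
    nope = (fp8.reshape(4, 128) * scales[:, None]).reshape(512)    # dequantize per tile
    return torch.cat([nope.to(torch.bfloat16), rope])


print(upconvert_entry(torch.randint(0, 256, (656,), dtype=torch.uint8)).shape)

# Request lookup used by the warp-per-token kernel: find the last
# workspace_starts entry that is <= the flat output token id.
workspace_starts = [0, 4096, 8192, 12288]                 # hypothetical 4 requests
req_id = bisect.bisect_right(workspace_starts, 9000) - 1  # -> 2
```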
@@ -1372,43 +1365,3 @@ void cp_gather_indexer_k_quant_cache(
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
// rope_dim]
) {
const int num_tokens = ql_nope.size(0);
const int num_heads = ql_nope.size(1);
const int nope_dim = ql_nope.size(2);
const int rope_dim = q_pe.size(2);
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
nope_dim);
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
if (num_tokens == 0) return;
constexpr int warps_per_block = 8;
const int total_warps = num_tokens * num_heads;
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
const int block_size = warps_per_block * 32;
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
q_pe.stride(1));
});
}
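The wrapper above is just a fused replacement for torch.cat((ql_nope, q_pe), dim=-1) on the transposed BMM output, which is exactly what the removed benchmark exercised. A hedged correctness check along the same lines; it assumes a CUDA build of vLLM that still ships ops.concat_mla_q:

```python
import torch
from vllm import _custom_ops as ops

num_tokens, num_heads, nope_dim, rope_dim = 128, 128, 512, 64

# Simulate the real code path: BMM output [N, B, L].transpose(0, 1) -> [B, N, L],
# i.e. a non-contiguous nope input, plus a contiguous rope input.
raw = torch.randn(num_heads, num_tokens, nope_dim, dtype=torch.bfloat16, device="cuda")
ql_nope = raw.transpose(0, 1)
q_pe = torch.randn(num_tokens, num_heads, rope_dim, dtype=torch.bfloat16, device="cuda")
q_out = torch.empty(
    num_tokens, num_heads, nope_dim + rope_dim, dtype=torch.bfloat16, device="cuda"
)

ops.concat_mla_q(ql_nope, q_pe, q_out)
torch.testing.assert_close(q_out, torch.cat((ql_nope, q_pe), dim=-1))
```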

View File

@@ -1,60 +0,0 @@
#ifndef CONCAT_MLA_Q_CUH_
#define CONCAT_MLA_Q_CUH_
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "cuda_vec_utils.cuh"
namespace vllm {
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
// q_pe [num_tokens, num_heads, 64]
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
// Currently instantiated only for NOPE_DIM=512.
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
template <typename DType, int NOPE_DIM>
__global__ void ConcatMLAQKernel(
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
const int64_t out_stride_0, const int64_t out_stride_1,
const int64_t nope_stride_0, const int64_t nope_stride_1,
const int64_t pe_stride_0, const int64_t pe_stride_1) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= num_tokens * num_heads) return;
const int token_id = flat_warp_id / num_heads;
const int head_id = flat_warp_id % num_heads;
const int lane_id = threadIdx.x & 31;
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
constexpr int nope_vec_loads =
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
const DType* nope_src =
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
#pragma unroll
for (int i = 0; i < nope_vec_loads; i++) {
const int offset = i * 32 + lane_id;
if constexpr (use_256b) {
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
} else {
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
}
}
const int* rope_src = reinterpret_cast<const int*>(
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
head_id * out_stride_1 + NOPE_DIM);
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
}
} // namespace vllm
#endif // CONCAT_MLA_Q_CUH_

View File

@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int64_t block_size, const int64_t block_size_stride) {
// For AMX 2D tiles, size of each line is 64 bytes
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
// For AMX B matrix, N always is 16
// For AMX B martix, N always is 16
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
// For now suppose block_size is divisible by amx_tile_column_num

View File

@@ -13,9 +13,6 @@
#elif defined(__aarch64__)
// arm implementation
#include "cpu_types_arm.hpp"
#elif defined(__riscv_v)
// riscv implementation
#include "cpu_types_riscv.hpp"
#else
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
#include "cpu_types_scalar.hpp"

View File

@@ -1,832 +0,0 @@
#ifndef CPU_TYPES_RISCV_HPP
#define CPU_TYPES_RISCV_HPP
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#include <riscv_vector.h>
#include <torch/all.h>
// ============================================================================
// Vector Register Type Definitions (VLEN=128 bits)
// ============================================================================
typedef vfloat16m1_t fixed_vfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat16m2_t fixed_vfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m1_t fixed_vfloat32m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat32m2_t fixed_vfloat32m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m4_t fixed_vfloat32m4_t
__attribute__((riscv_rvv_vector_bits(512)));
typedef vfloat32m8_t fixed_vfloat32m8_t
__attribute__((riscv_rvv_vector_bits(1024)));
typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
typedef vuint16m1_t fixed_vuint16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vuint16m2_t fixed_vuint16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vuint16m4_t fixed_vuint16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#ifdef RISCV_BF16_SUPPORT
typedef vbfloat16m1_t fixed_vbfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vbfloat16m2_t fixed_vbfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vbfloat16m4_t fixed_vbfloat16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#endif
namespace vec_op {
#ifdef RISCV_BF16_SUPPORT
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#else
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#endif
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define FORCE_INLINE __attribute__((always_inline)) inline
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
};
} // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
};
struct FP32Vec8;
struct FP32Vec16;
// ============================================================================
// FP16 Implementation
// ============================================================================
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat16m1_t reg;
explicit FP16Vec8(const void* ptr)
: reg(__riscv_vle16_v_f16m1(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m1(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
struct FP16Vec16 : public Vec<FP16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat16m2_t reg;
explicit FP16Vec16(const void* ptr)
: reg(__riscv_vle16_v_f16m2(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m2(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
// ============================================================================
// BF16 Implementation
// ============================================================================
#ifdef RISCV_BF16_SUPPORT
FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t v) {
return __riscv_vreinterpret_v_bf16m1_u16m1(v);
}
FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t v) {
return __riscv_vreinterpret_v_bf16m2_u16m2(v);
}
FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t v) {
return __riscv_vreinterpret_v_bf16m4_u16m4(v);
}
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vbfloat16m1_t reg;
explicit BF16Vec8(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vle16_v_u16m1(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {};
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vbfloat16m2_t reg;
explicit BF16Vec16(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vle16_v_u16m2(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vbfloat16m4_t reg;
explicit BF16Vec32(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m4_bf16m4(__riscv_vle16_v_u16m4(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {};
explicit BF16Vec32(const BF16Vec8& v) {
fixed_vuint16m1_t u16_val = bf16_to_u16(v.reg);
fixed_vuint16m4_t u16_combined =
__riscv_vcreate_v_u16m1_u16m4(u16_val, u16_val, u16_val, u16_val);
reg = __riscv_vreinterpret_v_u16m4_bf16m4(u16_combined);
};
void save(void* ptr) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
#else
// ============================================================================
// BF16 Fallback Implementation (FP32 Simulation)
// ============================================================================
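// Without hardware BF16 support each BF16 lane is kept widened to FP32.
// Loads place the 16 bf16 bits into the upper half of a float (<< 16);
// stores take the upper 16 bits back (truncation, no rounding).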
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg_fp32;
explicit BF16Vec8(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[8];
for (int i = 0; i < 8; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m2(tmp, 8);
}
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg_fp32;
explicit BF16Vec16(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[16];
for (int i = 0; i < 16; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m4(tmp, 16);
}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vfloat32m8_t reg_fp32;
explicit BF16Vec32(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[32];
for (int i = 0; i < 32; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp, 32);
}
explicit BF16Vec32(const BF16Vec8& v) {
float tmp_small[8];
__riscv_vse32_v_f32m2(tmp_small, v.reg_fp32, 8);
float tmp_large[32];
for (int i = 0; i < 4; ++i) {
std::memcpy(tmp_large + (i * 8), tmp_small, 8 * sizeof(float));
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp_large, 32);
}
void save(void* ptr) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
#endif
// ============================================================================
// FP32 Implementation
// ============================================================================
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
fixed_vfloat32m1_t reg;
explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {};
explicit FP32Vec4() : reg(__riscv_vfmv_v_f_f32m1(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec4(const float* ptr)
: reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {};
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
void save(float* ptr) const { __riscv_vse32_v_f32m1(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m1(ptr, reg, elem_num);
}
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg;
explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const float* ptr)
: reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {};
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat16m1_t v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec8(fixed_vbfloat16m1_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const BF16Vec8& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {};
#endif
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 max(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 abs() const {
return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
fixed_vfloat32m2_t temp =
__riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m2(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
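// exp(x) via 2^(x/ln2) = 2^n * 2^r with n = round(x/ln2) and r in ~[-0.5, 0.5]:
// 2^r is approximated by a degree-5 Taylor polynomial (coefficients ln2^k / k!)
// and 2^n is built by writing the biased exponent n + 127 directly into the
// float exponent bits. The biased exponent is clamped at zero, so very
// negative inputs flush to 0; large positive inputs are not range-checked.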
FP32Vec8 exp() const {
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m2_t x_scaled =
__riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);
fixed_vfloat32m2_t r =
__riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly =
__riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);
fixed_vint32m2_t biased_exp =
__riscv_vadd_vx_i32m2(n_int, 127, VEC_ELEM_NUM);
biased_exp = __riscv_vmax_vx_i32m2(biased_exp, 0, VEC_ELEM_NUM);
fixed_vint32m2_t exponent_bits =
__riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
fixed_vfloat32m2_t scale =
__riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);
return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
}
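// tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), with x clamped to [-9, 9] where
// tanh is already +/-1 to float precision, keeping exp() in range.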
FP32Vec8 tanh() const {
fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
__riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t x2 =
__riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
FP32Vec8 exp_val = FP32Vec8(x2).exp();
fixed_vfloat32m2_t num =
__riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t den =
__riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
}
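// erf via the Abramowitz & Stegun 7.1.26 rational approximation:
// erf(|x|) ~ 1 - (a1*t + a2*t^2 + ... + a5*t^5) * exp(-x^2), t = 1 / (1 + p*|x|),
// with the sign restored afterwards using erf(-x) = -erf(x).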
FP32Vec8 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);
fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
__riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m2_t exp_val =
FP32Vec8(__riscv_vfneg_v_f32m2(
__riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
__riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfneg_v_f32m2_m(mask, res, VEC_ELEM_NUM));
}
};
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg;
explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const float* ptr)
: reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {};
explicit FP32Vec16(const FP32Vec8& data)
: reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {};
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v);
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec16(fixed_vbfloat16m2_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const BF16Vec16& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {};
#endif
FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
}
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_max() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_min() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
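// Sums the group_size lanes in [idx * group_size, (idx + 1) * group_size)
// by masking an unordered reduction with a vid-based index mask.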
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
const int start = idx * group_size;
vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmand_mm_b8(
__riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
__riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
VEC_ELEM_NUM);
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar =
__riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
};
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 min(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 abs() const {
return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
}
FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(
max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m4(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m4(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
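// exp(), tanh() and er() below mirror the FP32Vec8 versions above, widened
// from 8 to 16 lanes (m2 -> m4 register groups).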
FP32Vec16 exp() const {
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m4_t x_scaled =
__riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
fixed_vfloat32m4_t r =
__riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly =
__riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vint32m4_t biased_exp = __riscv_vmax_vx_i32m4(
__riscv_vadd_vx_i32m4(n_int, 127, VEC_ELEM_NUM), 0, VEC_ELEM_NUM);
fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
__riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));
return FP32Vec16(__riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM));
}
FP32Vec16 tanh() const {
fixed_vfloat32m4_t x_clamped = __riscv_vfmin_vf_f32m4(
__riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
FP32Vec16 exp_val =
FP32Vec16(__riscv_vfmul_vf_f32m4(x_clamped, 2.0f, VEC_ELEM_NUM)).exp();
return FP32Vec16(__riscv_vfdiv_vv_f32m4(
__riscv_vfsub_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM),
__riscv_vfadd_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM), VEC_ELEM_NUM));
}
FP32Vec16 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
__riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m4_t exp_val =
FP32Vec16(__riscv_vfneg_v_f32m4(
__riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
__riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec16(__riscv_vfneg_v_f32m4_m(mask, res, VEC_ELEM_NUM));
}
};
// ============================================================================
// Type Traits & Global Helpers
// ============================================================================
template <typename T>
struct VecType {
using vec_type = void;
using vec_t = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
using vec_t = FP32Vec8;
};
template <>
struct VecType<c10::Half> {
using vec_type = FP16Vec8;
using vec_t = FP16Vec8;
};
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
using vec_t = BF16Vec8;
};
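// e.g. vec_t<float> is FP32Vec8, vec_t<c10::Half> is FP16Vec8 and
// vec_t<c10::BFloat16> is BF16Vec8.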
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
*reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
reg = __riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM);
}
inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
reg = __riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM);
}
inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
reg = __riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM);
}
inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc = acc.fma(a, b);
}
#ifdef RISCV_BF16_SUPPORT
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
*ptr = static_cast<__bf16>(v);
};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
#else
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
uint32_t val;
std::memcpy(&val, &v, 4);
*reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
#endif
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }
} // namespace vec_op
#ifndef CPU_KERNEL_GUARD_IN
#define CPU_KERNEL_GUARD_IN(NAME)
#endif
#ifndef CPU_KERNEL_GUARD_OUT
#define CPU_KERNEL_GUARD_OUT(NAME)
#endif
#endif // CPU_TYPES_RISCV_HPP
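
For context, a minimal sketch of how these wrappers compose on the caller side, assuming this header (guarded by CPU_TYPES_RISCV_HPP) is included and the translation unit is built with the matching fixed-VLEN RVV flags; fp32_to_bf16_row is an illustrative helper, not part of the header:

#include "cpu_types_riscv.hpp"  // assumed header name for this file

// Convert a row of fp32 values to bf16, 8 lanes at a time, with a scalar tail.
static void fp32_to_bf16_row(const float* src, c10::BFloat16* dst, int n) {
  int i = 0;
  for (; i + 8 <= n; i += 8) {
    vec_op::FP32Vec8 v(src + i);  // vector load of 8 floats
    vec_op::BF16Vec8 b(v);        // narrow to bf16 (HW or FP32-simulated path)
    b.save(dst + i);              // contiguous 16-bit store
  }
  for (; i < n; ++i) vec_op::storeFP32(src[i], dst + i);  // scalar remainder
}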

View File

@@ -237,10 +237,13 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
};
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
#ifdef __aarch64__
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoids runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
#else
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
#endif
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
@@ -408,19 +411,21 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoids runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
// selector can choose an optimally blocked
// weight layout.
.a_m_size = kProbeM,
.a_m_stride = b_k_size_,
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
MSizeCacheKey{
#ifdef VLLM_USE_ACL
// Arm Compute Library (ACL) backend for oneDNN does
// not support runtime
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true)
.weights_desc());
init_runtime_memory_cache(args);
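
Both hunks above replace a runtime M with a fixed probe M when building the primitive descriptor used for weight prepacking. A minimal sketch of that prepacking idea with oneDNN v3's C++ API follows; shapes, the f32 data type, and the names prepack_b/kProbeM are illustrative assumptions, not vLLM's handler code:

#include <oneapi/dnnl/dnnl.hpp>
using namespace dnnl;

memory prepack_b(engine& eng, const float* b_ptr, memory::dim K, memory::dim N) {
  // Plain row-major weights as supplied by the caller.
  memory::desc plain_b({K, N}, memory::data_type::f32, memory::format_tag::ab);
  // Probe descriptor: a concrete M (instead of DNNL_RUNTIME_DIM_VAL) lets the
  // kernel selector pick an optimally blocked weight layout via format_tag::any.
  constexpr memory::dim kProbeM = 128;
  memory::desc a_md({kProbeM, K}, memory::data_type::f32, memory::format_tag::ab);
  memory::desc b_md({K, N}, memory::data_type::f32, memory::format_tag::any);
  memory::desc c_md({kProbeM, N}, memory::data_type::f32, memory::format_tag::ab);
  matmul::primitive_desc pd(eng, a_md, b_md, c_md);
  // Reorder the user weights into the chosen blocked layout once, up front,
  // instead of paying for a reorder on every matmul call.
  memory user_b(plain_b, eng, const_cast<float*>(b_ptr));
  memory packed_b(pd.weights_desc(), eng);
  stream s(eng);
  reorder(user_b, packed_b).execute(s, user_b, packed_b);
  s.wait();
  return packed_b;
}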

View File

@@ -4,7 +4,7 @@
#include <torch/library.h>
// Note: overwrite the external definition for sharing same name between
// Note: overwrite the external defination for sharing same name between
// libraries use different ISAs.
#define TORCH_EXTENSION_NAME _C

View File

@@ -196,7 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
return val;
#else
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
return u32x8_t{};
return {};
#endif
}
@@ -211,51 +211,23 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
#endif
}
// 32-bit load / store.
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
// 32-bit cache-streaming (.cs) load / store.
// Falls back to ld32/st32 on ROCm (no .cs hint).
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
__forceinline__ __device__ int ld32_cs(const int* addr) {
#if VLLM_256B_PTX_ENABLED
int val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
#else
val = ld32(addr);
#endif
return val;
#else
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
return 0;
#endif
}
__forceinline__ __device__ void st32_cs(int* addr, int val) {
#ifndef USE_ROCM
#if VLLM_256B_PTX_ENABLED
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
#else
st32(addr, val);
#endif
}
// 128-bit cache-streaming (.cs) load / store.
// Falls back to ld128/st128 on ROCm (no .cs hint).
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
int4 val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(addr));
#else
ld128(val, addr);
#endif
return val;
}
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
#ifndef USE_ROCM
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
#else
st128(val, addr);
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
#endif
}
@@ -288,7 +260,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
bool pred) {
#ifndef USE_ROCM
#if VLLM_256B_PTX_ENABLED
uint32_t r0, r1, r2, r3;
asm volatile(
@@ -306,7 +278,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
val = uint4{r0, r1, r2, r3};
#else
assert(false && "ld128_cg_or_zero is not supported on ROCm");
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
#endif
}

View File

@@ -109,18 +109,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
#ifndef USE_ROCM
int flag = 0;
CUresult rdma_result = cuDeviceGetAttribute(
CUDA_CHECK(cuDeviceGetAttribute(
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
device);
if (rdma_result == CUDA_SUCCESS &&
flag) { // support GPUDirect RDMA if possible
device));
if (flag) { // support GPUDirect RDMA if possible
prop.allocFlags.gpuDirectRDMACapable = 1;
}
int fab_flag = 0;
CUresult fab_result = cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
if (fab_result == CUDA_SUCCESS &&
fab_flag) { // support fabric handle if possible
CUDA_CHECK(cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
if (fab_flag) { // support fabric handle if possible
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
}
#endif

View File

@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
// Intialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
// Intialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}

View File

@@ -73,9 +73,10 @@ void moe_permute(
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, stream);
});
}

View File

@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,

View File

@@ -2,7 +2,7 @@
template <typename T, bool CHECK_SKIPPED>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -16,6 +16,7 @@ __global__ void expandInputRowsKernel(
int64_t expanded_dest_row = blockIdx.x;
int64_t const expanded_source_row =
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
if (threadIdx.x == 0) {
assert(expanded_dest_row <= INT32_MAX);
@@ -53,7 +54,7 @@ __global__ void expandInputRowsKernel(
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -69,12 +70,12 @@ void expandInputRowsKernelLauncher(
bool is_check_skip = num_valid_tokens_ptr != nullptr;
auto func = func_map[is_check_skip];
func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row,
permuted_idx, expert_first_token_offset,
num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
func<<<blocks, threads, 0, stream>>>(
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
}
template <class T, class U>

View File

@@ -295,14 +295,10 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_scale);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
if (!lane_id) {
// Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8M0 format. Remove Mantissa.
// Packed UE8MO format. Remove Mantissa.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0;

View File

@@ -16,8 +16,6 @@
#include <torch/all.h>
#include "nvfp4_utils.cuh"
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@@ -53,10 +51,9 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
torch::Tensor const& output_scale_offset_by_experts);
#endif
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_sf,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_sf) {
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_sf, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
@@ -65,34 +62,6 @@ void scaled_fp4_quant_out(torch::Tensor const& input,
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
int64_t n = input.size(-1);
int64_t m = input.numel() / n;
auto device = input.device();
// Two fp4 values packed into a uint8
auto output = torch::empty(
{m, n / 2}, torch::TensorOptions().device(device).dtype(torch::kUInt8));
torch::Tensor output_sf;
if (is_sf_swizzled_layout) {
auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
output_sf = torch::empty(
{sf_m, sf_n},
torch::TensorOptions().device(device).dtype(torch::kInt32));
} else {
output_sf = torch::empty(
{m, n / CVT_FP4_SF_VEC_SIZE},
torch::TensorOptions().device(device).dtype(torch::kUInt8));
}
scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
output_sf);
return {output, output_sf};
}
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,

View File

@@ -18,7 +18,6 @@
#include <cuda_runtime.h>
#include <cuda_fp8.h>
#include <utility>
#include "../../cuda_vec_utils.cuh"
@@ -55,18 +54,6 @@ inline int computeEffectiveRows(int m) {
return round_up(m, ROW_TILE);
}
// Compute the shape of the swizzled SF output tensor.
// Returns (rounded_m, rounded_n / 4) where:
// rounded_m = round_up(m, 128)
// rounded_n = round_up(n / CVT_FP4_SF_VEC_SIZE, 4)
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
int64_t n) {
int64_t rounded_m = round_up(m, static_cast<int64_t>(128));
int64_t scale_n = n / CVT_FP4_SF_VEC_SIZE;
int64_t rounded_n = round_up(scale_n, static_cast<int64_t>(4));
return {rounded_m, rounded_n / 4};
}
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
uint32_t val;

View File

@@ -15,33 +15,31 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
scalar_t* __restrict__ residual = nullptr) {
float rms = 0.0f;
float token_scale = 0.0f;
// Compute rms
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
&rms, input, hidden_size, var_epsilon, residual);
// Compute scale
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
input_stride, residual);
residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
} else {
// FP8 - Do not invert token_scale for exact match with FBGemm
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
}
}
@@ -53,40 +51,38 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
scalar_t* __restrict__ residual = nullptr) {
// For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively.
bool const can_vectorize = hidden_size % 4 == 0 and input_stride % 4 == 0;
bool const can_vectorize = hidden_size % 4 == 0;
if (can_vectorize) {
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
has_residual>(
out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
input_stride, residual);
residual);
}
float rms = 0.0f;
float token_scale = 0.0f;
// Compute RMS
vllm::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
vllm::compute_rms<scalar_t, has_residual>(&rms, input, hidden_size,
var_epsilon, residual);
// Compute Scale
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
input_stride, residual);
residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
out, input, weight, rms, &token_scale, hidden_size, residual);
} else {
// FP8 - Do not invert s_token_scale for exact match with FBGemm
vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
out, input, weight, rms, &token_scale, hidden_size, residual);
}
}
@@ -101,20 +97,19 @@ __global__ void rms_norm_per_block_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr,
int64_t outer_scale_stride = 1) {
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
float rms;
// Compute RMS
// Always able to vectorize due to constraints on hidden_size
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
&rms, input, hidden_size, var_epsilon, residual);
// Compute Scale
// Always able to vectorize due to constraints on hidden_size and group_size
vllm::vectorized::compute_dynamic_per_token_scales<
scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
nullptr, scales, input, weight, rms, scale_ub, hidden_size, input_stride,
residual, outer_scale_stride);
nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual,
outer_scale_stride);
// RMS Norm + Quant
// Always able to vectorize due to constraints on hidden_size
@@ -125,7 +120,7 @@ __global__ void rms_norm_per_block_quant_kernel(
vllm::vectorized::norm_and_quant<
scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
has_residual, is_scale_transposed, group_size>(
out, input, weight, rms, scales, hidden_size, input_stride, residual,
out, input, weight, rms, scales, hidden_size, residual,
outer_scale_stride);
}
@@ -142,7 +137,6 @@ void rms_norm_dynamic_per_token_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -159,7 +153,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, input_stride,
var_epsilon, hidden_size,
has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
});
});
@@ -176,9 +170,7 @@ void rms_norm_dynamic_per_token_quant(
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -187,7 +179,6 @@ void rms_norm_dynamic_per_token_quant(
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
VLLM_DISPATCH_FLOATING_TYPES(
@@ -209,15 +200,6 @@ void rms_norm_per_block_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual, bool is_scale_transposed) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
TORCH_CHECK(hidden_size % 4 == 0,
"Hidden size must be divisible by 4 for vectorized access");
TORCH_CHECK(input_stride % 4 == 0,
"Input stride must be divisible by 4 for vectorized access");
TORCH_CHECK(group_size % 4 == 0,
"Group size must be divisible by 4 for vectorized access");
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -243,7 +225,7 @@ void rms_norm_per_block_quant_dispatch(
weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>()
: nullptr,
var_epsilon, hidden_size, input_stride,
var_epsilon, hidden_size,
has_residual ? residual->data_ptr<scalar_in_t>()
: nullptr,
scales.stride(1));
@@ -264,9 +246,7 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -275,7 +255,6 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
TORCH_CHECK(group_size == 128 || group_size == 64,

Some files were not shown because too many files have changed in this diff.