Compare commits

..

18 Commits

Author SHA1 Message Date
Andreas Karatzas
89a77b1084 [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 4c078fa546)
(cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)
2026-02-24 20:30:22 -08:00
Kevin H. Luu
d3c1513f5f [ci] Use the right tag for CPU arm64 image (#34915)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit a1a2d79442)
(cherry picked from commit 772f70839192262ff01c533d821a11a225d1c00f)
2026-02-24 20:30:13 -08:00
Cyrus Leung
5dbfbc967b [CI/Build] Fix gRPC version mismatch (#35013)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 965fe45935)
(cherry picked from commit 90308959295b66049024649fe1273070477f343d)
2026-02-24 20:30:02 -08:00
khluu
c86cdcbcd2 Revert "[Release 2.10] Update to Torch 2.10 - final release (#30525)"
This reverts commit f97ca67176.
2026-02-24 20:28:53 -08:00
khluu
3c9496f146 Revert "[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)"
This reverts commit 55a1baebc5.
2026-02-24 20:28:45 -08:00
khluu
2d5be1dd5c release script
Signed-off-by: khluu <khluu000@gmail.com>
2026-02-12 02:37:52 -08:00
Michael Goin
7a06e5b05b [Bugfix] Fix MTP accuracy for GLM-5 (#34385)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit ec12d39d44)
2026-02-11 20:54:27 -08:00
Junseo Park
946b2f106c [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
Signed-off-by: pjs102793 <pjs102793@naver.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 5458eb835d)
2026-02-11 20:54:14 -08:00
Nick Hill
5e8adb0c49 [Misc] Bump fastsafetensors version for latest fixes (#34273)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 79504027ef)
2026-02-11 20:54:00 -08:00
Xinyu Dong
9be1ff2d3a [Bugfix] fix default is_neox_style is True for deepseek (#34353)
Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
(cherry picked from commit be7f3d5d20)
2026-02-11 20:53:40 -08:00
Jee Jee Li
b3ee90f961 [Model] GLM adaptation (#34124)
(cherry picked from commit 978a37c823)
2026-02-11 20:53:11 -08:00
Seiji Eicher
c44d0c6d66 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 5045d5c983)
2026-02-11 02:33:40 -08:00
Kunshang Ji
83db96d8cd [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit cb9574eb85)
2026-02-11 02:33:27 -08:00
zofia
dbfb79fe45 [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit b482f71e9f)
2026-02-11 02:33:15 -08:00
Roger Wang
b2e1fc3589 [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
(cherry picked from commit 8a5e0e2b2b)
2026-02-11 02:33:04 -08:00
Gregory Shtrasberg
55a1baebc5 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit c60f8e3b49)
2026-02-11 02:32:52 -08:00
Charlie Fu
e1e9841631 [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
(cherry picked from commit bb9f97308d)
2026-02-11 02:32:41 -08:00
zofia
5bd63387c3 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit 9bdb06b436)
2026-02-11 02:32:27 -08:00
1046 changed files with 19713 additions and 83757 deletions


@@ -1,7 +1,6 @@
group: Hardware - AMD Build
group: Hardware
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
@@ -10,7 +9,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm


@@ -8,7 +8,7 @@ clean_docker_tag() {
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
@@ -142,16 +142,11 @@ resolve_parent_commit() {
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
@@ -159,7 +154,7 @@ print_bake_config() {
#################################
print_instance_info
if [[ $# -lt 5 ]]; then
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi
@@ -168,8 +163,10 @@ REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# build config
TARGET="test-ci"
@@ -196,6 +193,8 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
@@ -203,6 +202,8 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"


@@ -3,9 +3,9 @@ steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry:
automatic:
- exit_status: -1 # Agent was lost
@@ -41,7 +41,7 @@ steps:
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []


@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,13 +24,13 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu


@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then


@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu


@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit "$LIMIT"
--limit $LIMIT


@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``


@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``


@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -20,11 +20,14 @@ usage() {
echo
}
while getopts "m:l:f:t:" OPT; do
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;


@@ -9,10 +9,8 @@ import json
import os
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -277,131 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
"""
name = "sheet" if name is None else str(name)
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
model = d.get("Model", "model")
model_short = str(model).split("/")[-1]
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
startrow = 0
for title, df in blocks:
pd.DataFrame([[title]]).to_excel(
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
)
startrow += 1
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3
def _safe_filename(s: str) -> str:
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
@@ -555,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
@@ -562,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
@@ -584,95 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# -----------------------------
# Plot helper
# -----------------------------
@@ -750,21 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser
@@ -885,6 +657,7 @@ def maybe_write_plot(
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
@@ -957,151 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
main_fh.write(group_header)
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main():


@@ -1,4 +1,6 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,19 +9,14 @@
set -x
set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
@@ -47,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo "$numa_count"
echo $numa_count
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
@@ -115,12 +112,13 @@ json2envs() {
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done
'
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
@@ -254,16 +252,37 @@ run_benchmark_tests() {
done
}
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_latency_tests() {
run_benchmark_tests "latency" "$1"
}
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then
# Plain format: test cases array
.[]
@@ -285,50 +304,7 @@ merge_serving_tests_stream() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
'
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -397,7 +373,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -408,9 +384,6 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -429,12 +402,14 @@ run_serving_tests() {
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
@@ -450,9 +425,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
@@ -470,15 +443,12 @@ run_serving_tests() {
done
# clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -488,13 +458,7 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -515,16 +479,11 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
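For context, a hedged sketch of how the DRY_RUN, MODEL_FILTER, and DTYPE_FILTER controls that appear on one side of this hunk would be exercised; the script name and the model/dtype values are illustrative assumptions, while the variable names and the behavior noted in the comments come from the hunk itself.

# Hypothetical dry-run invocation (script name and values assumed for illustration).
# On the side of the diff that defines these controls, DRY_RUN=1 skips the HF_TOKEN
# check, does not start the vLLM server or run client commands, and exits after the
# serving suite; MODEL_FILTER/DTYPE_FILTER narrow the merged serving-test stream.
DRY_RUN=1 \
MODEL_FILTER="meta-llama/Llama-3.1-8B-Instruct" \
DTYPE_FILTER="bfloat16" \
SERVING_JSON="serving-tests.json" \
bash run-performance-benchmarks.sh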


@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}


@@ -1,283 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}


@@ -148,6 +148,136 @@
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}


@@ -248,8 +248,8 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
- group: "Publish release artifacts"
key: "publish-release-artifacts"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
@@ -265,6 +265,27 @@ steps:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
- block: "Confirm update release images to DockerHub"
key: block-update-release-images-dockerhub
depends_on:
- input-release-version
- annotate-release-workflow
- label: "Publish release images to DockerHub"
depends_on:
- block-update-release-images-dockerhub
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)


@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases


@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1
fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo ""
echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"


@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT
trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \


@@ -1,37 +1,25 @@
#!/bin/bash
# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
###############################################################################
# Helper Functions
###############################################################################
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -40,12 +28,15 @@ cleanup_docker() {
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
@@ -54,258 +45,193 @@ cleanup_docker() {
}
cleanup_network() {
local max_nodes=${NUM_NODES:-2}
for node in $(seq 0 $((max_nodes - 1))); do
if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}" || true
for node in $(seq 0 $((NUM_NODES-1))); do
if docker pr -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep -q docker-net; then
docker network rm docker-net || true
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
###############################################################################
# Pytest marker re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around pytest -m marker expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
# This function detects unquoted multi-word marker expressions and re-quotes
# them so they survive the final bash -c expansion.
###############################################################################
re_quote_pytest_markers() {
local cmds="$1"
# Pattern: -m not <identifier> -> -m 'not <identifier>'
# Handles the common cases: 'not cpu_test', 'not slow_test', etc.
cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
echo "$cmds"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image ---
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands="$*"
commands=$@
echo "Raw commands: $commands"
# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
commands=$(apply_rocm_test_overrides "$commands")
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
MYPYTHONPATH=".."
# Verify GPU access
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
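# Illustrative only (hypothetical host values): `getent group render` prints a line such as
# "render:x:110:ciuser", so the cut above yields render_gid=110, which is passed to the
# container below via --group-add "$render_gid" so it can open /dev/kfd and the render devices.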
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
# BASH_REMATCH[1] = prefix (everything before first bracket)
# BASH_REMATCH[2] = comma-separated node0 commands
# BASH_REMATCH[3] = comma-separated node1 commands
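# Illustrative example (hypothetical test commands, not executed here):
#   commands='setup ; ["pytest -v -s a.py, pytest -v -s b.py"] && ["pytest -v -s c.py, pytest -v -s d.py"]'
#   => prefix="setup", node0 splits on ',' into two commands, node1 likewise,
#   and node0[i]/node1[i] are paired by index into one run-multi-node-test.sh step each.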
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
saved_IFS=$IFS
IFS=','
read -ra node0 <<< "${BASH_REMATCH[2]}"
read -ra node1 <<< "${BASH_REMATCH[3]}"
IFS=$saved_IFS
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
cleanup_network
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network
exit 111
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi


@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All CPU tests are expected to finish in less than 40 mins.


@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
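# Note: Docker's default /dev/shm is only 64 MB, which is typically too small for the
# shared-memory communication between tensor-parallel worker processes, hence 4g here.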
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"


@@ -7,7 +7,7 @@ set -exuo pipefail
# Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - .
cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
@@ -39,12 +39,12 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; }
remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
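# Illustrative: EXITCODE starts at 1 and is presumably reset on success further down (not
# shown in this hunk); the EXIT trap removes the container first and then exits with that
# code, so an aborted run still cleans up and reports failure.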
remove_docker_containers
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \
docker run --rm --runtime=habana --name=${container_name} --network=host \
-e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \


@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if ! get_config; then
if [ $? -ne 0 ]; then
exit 1
fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
--progress=plain --load -t "${image_name}" -f - .
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ...
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device"
printf '%s\n' "/dev/davinci${dev_idx}"
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}
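# Illustrative example (agent name assumed, not taken from a real host):
#   parse_and_gen_devices "atlas-a2-001-2-2cards"
#   -> index=2, cards_num=2, dev_idx=(index-1)*cards_num+i = 2 and 3
#   -> device args: --device /dev/davinci2 --device /dev/davinci3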
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}"
mkdir -p ${model_cache_dir}
docker run \
"${device_args[@]}" \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \


@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"


@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"


@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {


@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME"
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT


@@ -0,0 +1,98 @@
#!/bin/bash
set -ex
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
echo "RELEASE_VERSION is not set"
exit 1
fi
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
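# Illustrative (standard multi-arch manifest behavior): after these pushes, pulling
# vllm/vllm-openai:v${RELEASE_VERSION} resolves to the -x86_64 image on amd64 hosts and to
# the -aarch64 image on arm64 hosts; the same applies to the -cu130 and -cpu manifests.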


@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
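# Illustrative: a pin such as "vllm==0.10.0" (hypothetical version) is deleted here so that
# `uv sync --inexact` below does not replace the nightly vLLM already installed in the
# test environment with the pinned release.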
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"


@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend "$BACK" \
--port "$PORT" &
--all2all-backend $BACK \
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
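# e.g. invoked from the scheduled pipeline as:
#   bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030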
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
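# With the defaults above this gates the run on GSM8K accuracy >= 0.25 over the full
# 1319-question set; tighten THRESHOLD via the first positional argument if needed.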
cleanup
SERVER_PID=


@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \
--enforce-eager \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# shellcheck source=/dev/null
source "$ENV_FILE"
source $ENV_FILE
remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file "$ENV_FILE" \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL="$MODEL" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \


@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve "$MODEL" \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for _ in {1..120}; do
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len "$OUTPUT_LEN" \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."


@@ -76,15 +76,16 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# these indices have relative paths that work as long as they sit next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
alias_arg=""
fi
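# Illustrative: with a hypothetical DEFAULT_VARIANT_ALIAS=cu129, "${alias_args[@]}" expands
# to exactly two argv words (--alias-to-default cu129) and to zero words when the alias is
# unset, and it keeps the value intact even if it contains shell-special characters.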
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
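# Illustrative layout (hypothetical release 0.11.0 at commit abc123): indices end up under
# s3://$BUCKET/abc123/ and s3://$BUCKET/0.11.0/, and both point back at the wheels stored
# under the commit directory, as described above.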


@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
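# e.g. a hypothetical vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl would be uploaded, while
# variant wheels such as vllm-0.11.0+cu130-... are skipped because of the '+' in the name.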
@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"


@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -14,8 +14,3 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd


@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1


@@ -121,10 +121,13 @@ steps:
optional: true
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Run all models and attn backends but only Inductor partition and native custom ops
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
@@ -159,7 +162,7 @@ steps:
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -194,8 +197,7 @@ steps:
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run all models and attn backends but only Inductor partition and native custom ops
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"


@@ -104,6 +104,7 @@ steps:
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
@@ -145,7 +146,6 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -165,7 +165,6 @@ steps:
num_devices: 2
num_nodes: 2
no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -198,18 +197,7 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4


@@ -29,11 +29,15 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
timeout_in_minutes: 60
device: h100
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_devices: 1
soft_fail: true
num_devices: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh


@@ -28,11 +28,3 @@ steps:
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine


@@ -24,11 +24,6 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -47,13 +42,15 @@ steps:
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling)
@@ -65,11 +62,6 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50


@@ -115,7 +115,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -157,3 +156,14 @@ steps:
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py


@@ -73,29 +73,3 @@ steps:
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt


@@ -16,7 +16,6 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
@@ -26,11 +25,6 @@ steps:
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU)
depends_on:
@@ -114,11 +108,9 @@ steps:
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/detokenizer
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
@@ -131,7 +123,6 @@ steps:
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -145,7 +136,6 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
@@ -153,6 +143,20 @@ steps:
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
device: h100


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -36,12 +38,6 @@ steps:
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Basic Models Test (Other CPU) # 5min
depends_on:


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -37,7 +40,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -52,21 +56,13 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/


@@ -12,10 +12,3 @@ steps:
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers

.github/CODEOWNERS

@@ -2,60 +2,42 @@
# for more info about CODEOWNERS file
# These lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon
@@ -133,8 +115,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -170,7 +152,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop
# Security guide and policies


@@ -19,7 +19,6 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies
run: |

.gitignore

@@ -238,6 +238,3 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h


@@ -143,11 +143,6 @@ repos:
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last
- id: suggestion
name: Suggestion


@@ -9,14 +9,13 @@ build:
python: "3.12"
jobs:
post_checkout:
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt


@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
#
# Try to find python package with an executable that exactly matches
@@ -293,7 +293,6 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -434,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
endif()
if (MARLIN_SM75_ARCHS)
if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -446,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
endif()
if (MARLIN_FP8_ARCHS)
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -771,24 +770,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -1061,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
endif()
if (MARLIN_MOE_SM75_ARCHS)
if (MARLIN_MOE_SM75_ARCHS)
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SM75_SRC}"
@@ -1100,27 +1081,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures")
endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif()
message(STATUS "Enabling moe extension.")

View File

@@ -229,40 +229,3 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"

View File

@@ -43,7 +43,6 @@ from common import (
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
batch_spec_sort_key,
is_mla_backend,
)
@@ -219,13 +218,10 @@ def run_model_parameter_sweep(
by_param_and_spec[key].append(r)
break
# Sort by param value then spec (batch_size, q_len, kv_len)
# Sort by param value then spec
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
)
current_param_value = None
@@ -334,7 +330,7 @@ def run_parameter_sweep(
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
for spec in sorted(by_spec.keys()):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
@@ -500,18 +496,15 @@ def main():
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Override args with YAML values
# (YAML takes precedence unless CLI arg was explicitly set)
# Backend(s)
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
@@ -551,15 +544,13 @@ def main():
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys)
if "device" in yaml_config:
args.device = yaml_config["device"]
if "repeats" in yaml_config:
args.repeats = yaml_config["repeats"]
if "warmup_iters" in yaml_config:
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Benchmark settings
if "benchmark" in yaml_config:
bench = yaml_config["benchmark"]
args.device = bench.get("device", args.device)
args.repeats = bench.get("repeats", args.repeats)
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.profile_memory = bench.get("profile_memory", args.profile_memory)
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:

View File

@@ -12,36 +12,16 @@ from typing import Any
import numpy as np
import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console
from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
return (0, 0, 0)
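A quick way to see why this key exists: Python's default string sort puts "128q1s1k" ahead of "2q1ks4k" because it compares characters, not magnitudes. The snippet below uses a simplified leading-integer key in place of the full parse-based one, purely to illustrate the ordering difference.

import re

specs = ["16q1s512", "2q1ks4k", "128q1s1k", "8q8s4k"]

def leading_int(spec: str) -> int:
    m = re.match(r"(\d+)", spec)
    return int(m.group(1)) if m else 1  # a spec like "q1ks2k" implies one request

print(sorted(specs))                   # ['128q1s1k', '16q1s512', '2q1ks4k', '8q8s4k']
print(sorted(specs, key=leading_int))  # ['2q1ks4k', '8q8s4k', '16q1s512', '128q1s1k']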
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None):
def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -52,8 +32,6 @@ class MockHfConfig:
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self):
return self
@@ -104,38 +82,6 @@ class MockKVBProj:
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
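For intuition, the buffer MockIndexer fills is what lets a sparse MLA kernel restrict each token to a small set of KV slots. The stand-alone sketch below does the dense equivalent, a gather over the selected positions, with made-up shapes; real backends consume the index buffer inside fused kernels rather than materializing the gather.

import torch

num_tokens, topk, kv_len, head_dim = 4, 8, 128, 64
kv_slots = torch.randn(kv_len, head_dim)                                # flattened KV entries
topk_indices = torch.randint(0, kv_len, (num_tokens, topk), dtype=torch.int32)

# Gather only the selected slots per token: [num_tokens, topk, head_dim]
selected = kv_slots[topk_indices.long()]
print(selected.shape)  # torch.Size([4, 8, 64])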
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
@@ -370,19 +316,14 @@ class ResultsFormatter:
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec, preserving first-occurrence order
# Group by batch spec
by_spec = {}
specs_order = []
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
@@ -396,8 +337,6 @@ class ResultsFormatter:
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
@@ -411,14 +350,12 @@ class ResultsFormatter:
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in specs_order:
for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
row = [spec]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
@@ -549,11 +486,10 @@ def get_attention_scale(head_dim: int) -> float:
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the AttentionBackendEnum.
Check if backend is an MLA backend using the backend's is_mla() property.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
Returns:
True if the backend is an MLA backend, False otherwise
@@ -561,8 +497,7 @@ def is_mla_backend(backend: str) -> bool:
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
backend_class = AttentionBackendEnum[backend.upper()].get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError):
except (KeyError, ValueError, ImportError):
return False
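Usage sketch for the helper above, assuming a vLLM installation whose registry exposes the named backends; the import path matches the `from common import ... is_mla_backend` line shown earlier in this diff.

from common import is_mla_backend  # benchmark helper defined above

backends = ["CUTLASS_MLA", "FLASH_ATTN_MLA", "FLASH_ATTN", "TRITON_ATTN"]
mla_backends = [b for b in backends if is_mla_backend(b)]
standard_backends = [b for b in backends if not is_mla_backend(b)]
# With vLLM importable this is expected to print:
# ['CUTLASS_MLA', 'FLASH_ATTN_MLA'] ['FLASH_ATTN', 'TRITON_ATTN']
print(mla_backends, standard_backends)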

View File

@@ -3,7 +3,7 @@
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
@@ -12,13 +12,6 @@ model:
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
@@ -41,30 +34,28 @@ batch_specs:
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 100
warmup_iters: 10
repeats: 5
warmup_iters: 3
profile_memory: true
# Backend-specific tuning
CUTLASS_MLA:
cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA:
flashattn_mla:
reorder_batch_threshold: 512
FLASHMLA:
flashmla:
reorder_batch_threshold: 1

View File

@@ -45,10 +45,10 @@ batch_specs:
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -6,7 +6,7 @@
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: FLASH_ATTN_MLA
backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
@@ -62,10 +62,11 @@ model:
block_size: 128
# Benchmark settings
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
benchmark:
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:

View File

@@ -41,17 +41,18 @@ batch_specs:
# Backends that support query length > 1
backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - FLASHINFER_MLA
# - flashinfer_mla
# Benchmark settings
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:

View File

@@ -25,22 +25,14 @@ batch_specs:
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8)
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
# Context extension
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends:
- FLASH_ATTN
- TRITON_ATTN
- FLASHINFER
- flash
- triton
- flashinfer
device: "cuda:0"
repeats: 5

View File

@@ -8,13 +8,14 @@ This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import importlib
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockIndexer,
MockKVBProj,
MockLayer,
setup_mla_dims,
@@ -61,7 +62,6 @@ def create_minimal_vllm_config(
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -73,8 +73,6 @@ def create_minimal_vllm_config(
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns:
VllmConfig for benchmarking
@@ -84,7 +82,7 @@ def create_minimal_vllm_config(
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
@@ -122,12 +120,16 @@ def create_minimal_vllm_config(
seed=0,
max_model_len=32768,
quantization=None,
quantization_param_path=None,
enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto",
)
finally:
@@ -178,65 +180,56 @@ def create_minimal_vllm_config(
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
# Backend name to class name prefix mapping
_BACKEND_NAME_MAP = {
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = {
"FLASHMLA": {
"flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
},
"FLASHMLA_SPARSE": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"flashinfer_mla": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Get backend configuration using naming conventions.
Uses the registry to get the backend class and extract configuration
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
All MLA backends follow the pattern:
- Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata)
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
- MetadataBuilder: {Name}MetadataBuilder
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
if backend not in _BACKEND_NAME_MAP:
raise ValueError(f"Unknown backend: {backend}")
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
name = _BACKEND_NAME_MAP[backend]
props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return {
"backend_class": backend_class,
"impl_class": backend_class.get_impl_cls(),
"builder_class": backend_class.get_builder_cls(),
"module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": f"{name}Impl",
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"),
"block_size": block_size,
"is_sparse": is_sparse,
"block_size": props.get("block_size", None),
}
@@ -454,26 +447,22 @@ def _create_backend_impl(
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict from _get_backend_config()
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns:
Tuple of (impl, layer, builder_instance, indexer)
Tuple of (impl, layer, builder_instance)
"""
# Get classes from backend config (already resolved by _get_backend_config)
impl_class = backend_cfg["impl_class"]
builder_class = backend_cfg["builder_class"]
# Import backend classes
backend_module = importlib.import_module(backend_cfg["module"])
impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
@@ -485,44 +474,26 @@ def _create_backend_impl(
v_head_dim=mla_dims["v_head_dim"],
)
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
@@ -544,7 +515,9 @@ def _create_backend_impl(
# Create builder instance if needed
builder_instance = None
if builder_class:
if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -556,7 +529,7 @@ def _create_backend_impl(
device=device,
)
return impl, layer, builder_instance, indexer
return impl, layer, builder_instance
# ============================================================================
@@ -621,7 +594,6 @@ def _run_single_benchmark(
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
indexer=None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
@@ -634,7 +606,6 @@ def _run_single_benchmark(
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
indexer: Optional MockIndexer for sparse backends
Returns:
BenchmarkResult with timing statistics
@@ -642,9 +613,7 @@ def _run_single_benchmark(
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
@@ -672,16 +641,8 @@ def _run_single_benchmark(
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
@@ -732,13 +693,11 @@ def _run_single_benchmark(
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
@@ -748,7 +707,6 @@ def _run_mla_benchmark_batched(
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
List of BenchmarkResult objects
@@ -772,27 +730,19 @@ def _run_mla_benchmark_batched(
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
# Create backend impl, layer, and builder (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device
)
# Run each benchmark with the shared impl
@@ -818,7 +768,6 @@ def _run_mla_benchmark_batched(
backend_cfg,
mla_dims,
device,
indexer=indexer,
)
results.append(result)
@@ -844,24 +793,20 @@ def run_mla_benchmark(
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse)
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -871,9 +816,9 @@ def run_mla_benchmark(
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends
else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
@@ -885,7 +830,7 @@ def run_mla_benchmark(
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
results = _run_mla_benchmark_batched(backend, configs_with_params)
# Return single result or list based on input
return results[0] if return_single else results
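A hypothetical driver for the batched mode described in the docstring. The module name and the BenchmarkConfig constructor arguments are assumptions for illustration; only run_mla_benchmark's (config, param) tuple convention comes from the code above.

from common import BenchmarkConfig           # assumed location of BenchmarkConfig
from mla_backends import run_mla_benchmark   # assumed module name for this file

# Pair each config with a reorder_batch_threshold (the FlashAttn/FlashMLA param).
configs = [
    (BenchmarkConfig(backend="flashmla", batch_spec=spec, block_size=64), 1)
    for spec in ("64q1s2k", "64q1s8k")
]
results = run_mla_benchmark("flashmla", configs)
for r in results:
    print(r.config.batch_spec, f"{r.mean_time * 1e3:.3f} ms")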

View File

@@ -8,9 +8,7 @@ This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import logging
import types
from contextlib import contextmanager
import numpy as np
import torch
@@ -26,13 +24,8 @@ from vllm.config import (
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
@@ -40,41 +33,37 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
if backend not in _BACKEND_CONFIG:
raise ValueError(
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
) from e
return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
"""Temporarily set vLLM logger to WARNING level."""
logger = logging.getLogger("vllm")
old_level = logger.level
logger.setLevel(logging.WARNING)
try:
yield
finally:
logger.setLevel(old_level)
f"Unknown backend: {backend}. "
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
)
return _BACKEND_CONFIG[backend]
# ============================================================================
@@ -99,7 +88,11 @@ def _build_common_attn_metadata(
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
max_seq_len = int(seq_lens.max().item())
seq_lens_cpu = seq_lens.cpu()
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
@@ -114,6 +107,8 @@ def _build_common_attn_metadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
@@ -126,6 +121,7 @@ def _build_common_attn_metadata(
def _create_vllm_config(
config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
@@ -133,7 +129,7 @@ def _create_vllm_config(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype="auto", # Use model's native dtype
dtype=dtype,
seed=0,
max_model_len=1024,
)
@@ -202,12 +198,15 @@ def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
dtype: torch.dtype,
):
"""Create backend implementation instance."""
backend_class = backend_cfg["backend_class"]
import importlib
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
@@ -228,7 +227,7 @@ def _create_backend_impl(
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer
return backend_class, impl, layer, dtype
def _create_metadata_builder(
@@ -236,44 +235,11 @@ def _create_metadata_builder(
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
backend_name: str = "",
):
"""Create metadata builder instance."""
layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
return backend_class.get_builder_cls()(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
layer_names=["layer_0"],
vllm_config=vllm_config,
device=device,
)
@@ -315,44 +281,39 @@ def _create_input_tensors(
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
backend_class,
cache_layout: str,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers using the backend's methods.
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
to create the cache with the correct shape and memory layout.
"""
# Get the logical shape from the backend
cache_shape = backend_class.get_kv_cache_shape(
num_blocks=max_num_blocks,
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
)
# Get the stride order for custom memory layout
try:
stride_order = backend_class.get_kv_cache_stride_order()
assert len(stride_order) == len(cache_shape)
except (AttributeError, NotImplementedError):
stride_order = tuple(range(len(cache_shape)))
# Permute shape to physical layout order
physical_shape = tuple(cache_shape[i] for i in stride_order)
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
"""Create KV cache tensors for all layers."""
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
max_num_blocks,
2,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
else:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
2,
max_num_blocks,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
return cache_list
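The variant that queries the backend for get_kv_cache_shape() and get_kv_cache_stride_order() relies on a small allocation trick: the tensor is allocated in its physical (memory) order and then permuted so callers see the logical shape while the strides match the kernel's expected layout. A stand-alone illustration, with made-up shapes rather than any backend's real layout:

import torch

logical_shape = (2, 16, 128, 1, 64)   # e.g. [2, num_blocks, block_size, num_kv_heads, head_dim]
stride_order = (1, 0, 2, 3, 4)        # physical layout stores num_blocks outermost

physical_shape = tuple(logical_shape[i] for i in stride_order)
inv_order = [stride_order.index(i) for i in range(len(stride_order))]

cache = torch.zeros(*physical_shape).permute(*inv_order)
print(cache.shape)            # logical view: torch.Size([2, 16, 128, 1, 64])
print(cache.is_contiguous())  # False: strides follow the physical layout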
@@ -435,7 +396,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Supports: flash, triton, flashinfer
Args:
config: Benchmark configuration
@@ -450,79 +411,60 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
requests = parse_batch_spec(config.batch_spec)
if config.backend == "FLASHINFER":
if config.backend == "flashinfer":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
batch_size = len(q_lens)
# Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
# Suppress vLLM logs during setup to reduce spam
with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
backend_class, impl, layer, dtype = _create_backend_impl(
backend_cfg, config, device
)
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device, config.backend
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_class, device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0

View File

@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm"
@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes...
server_started=0
for _ in {1..60}; do
for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f "$vllm_log"
rm -f $vllm_log
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"
@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate "$best_request_rate" \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \

View File

@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")

View File

@@ -1,471 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
Compares:
- apply_top_k_top_p_triton (Triton binary search)
- apply_top_k_top_p (PyTorch sort-based)
Scenarios:
- top_k only (whole batch, partial batch)
- top_p only (whole batch, partial batch)
- mix of top_k and top_p
"""
import argparse
import gc
from dataclasses import dataclass
import torch
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
from vllm.v1.sample.ops.topk_topp_triton import (
apply_top_k_top_p_triton,
reset_buffer_cache,
)
@dataclass
class BenchmarkConfig:
"""Configuration for a benchmark run."""
name: str
batch_size: int
vocab_size: int
# k and p can be tensors or None
k_values: torch.Tensor | None # [batch_size] or None
p_values: torch.Tensor | None # [batch_size] or None
description: str
ops_pct: float = 0.0 # Percentage of ops relative to batch size
def calculate_ops_pct(
k_values: torch.Tensor | None,
p_values: torch.Tensor | None,
vocab_size: int,
batch_size: int,
) -> float:
"""
Calculate the percentage of active top-k and top-p operations.
Returns percentage where 100% = batch_size ops.
E.g., if all rows have both top-k and top-p active, returns 200%.
"""
active_ops = 0
if k_values is not None:
# Count rows where k < vocab_size (active top-k filtering)
active_ops += (k_values < vocab_size).sum().item()
if p_values is not None:
# Count rows where p < 1.0 (active top-p filtering)
active_ops += (p_values < 1.0).sum().item()
return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
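A quick worked example of that percentage: with a batch of four where two rows apply a real top-k and all four apply a real top-p, the count is (2 + 4) / 4 = 150%.

import torch

vocab_size, batch_size = 32000, 4
k = torch.tensor([50, 50, vocab_size, vocab_size], dtype=torch.int32)  # 2 of 4 active
p = torch.tensor([0.9, 0.9, 0.9, 0.9])                                 # 4 of 4 active
active_ops = (k < vocab_size).sum().item() + (p < 1.0).sum().item()
print(active_ops / batch_size * 100)  # 150.0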
def create_logits(
batch_size: int, vocab_size: int, device: str = "cuda"
) -> torch.Tensor:
"""Create random logits mimicking a realistic LLM distribution.
Uses a Zipf-like probability distribution (rank^-1.1) converted to logits
via log, then randomly permuted per row. This produces a peaked distribution
where a small number of tokens capture most probability mass, similar to
real model outputs.
"""
# Create Zipf-like probabilities: p(rank) ~ rank^(-alpha)
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
probs = ranks.pow(-1.1)
probs = probs / probs.sum()
# Convert to logits (log-probabilities, unnormalized is fine)
base_logits = probs.log()
# Broadcast to batch and randomly permute each row
logits = base_logits.unsqueeze(0).expand(batch_size, -1).clone()
for i in range(batch_size):
logits[i] = logits[i, torch.randperm(vocab_size, device=device)]
return logits
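The per-row torch.randperm loop above is simple but costs one Python-level call per row; an equivalent vectorized shuffle (sketch below) draws random keys and argsorts them to get an independent permutation per row. This is an optional rewrite for illustration, not part of the original benchmark.

import torch

batch_size, vocab_size = 4, 10
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32)
base_logits = (ranks.pow(-1.1) / ranks.pow(-1.1).sum()).log()  # same Zipf-like logits

perm = torch.rand(batch_size, vocab_size).argsort(dim=-1)      # one permutation per row
logits = base_logits.expand(batch_size, -1).gather(-1, perm)
print(logits.shape)  # torch.Size([4, 10])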
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.cuda.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
def benchmark_function(
func,
logits: torch.Tensor,
k: torch.Tensor | None,
p: torch.Tensor | None,
warmup_iters: int = 5,
benchmark_iters: int = 20,
) -> tuple[float, int]:
"""
Benchmark a function and return (avg_time_ms, peak_memory_bytes).
Returns average time in milliseconds and peak memory usage.
"""
# Warmup
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
# Benchmark
start_events = [
torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
for i in range(benchmark_iters):
logits_copy = logits.clone()
start_events[i].record()
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
# Calculate timing
times = [
start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
]
avg_time = sum(times) / len(times)
# Get peak memory
_, peak_memory = measure_memory()
return avg_time, peak_memory
def create_benchmark_configs(
batch_sizes: list[int],
vocab_sizes: list[int],
device: str = "cuda",
) -> list[BenchmarkConfig]:
"""Create all benchmark configurations."""
configs = []
for vocab_size in vocab_sizes:
for batch_size in batch_sizes:
# 1. Top-k only - whole batch (all rows have k < vocab_size)
k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_all,
p_values=None,
description=f"Top-k only (whole batch, k=50), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_all, None, vocab_size, batch_size),
)
)
# 2. Top-k only - partial batch (half have k=50, half have k=vocab_size)
k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
k_partial[batch_size // 2 :] = vocab_size # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topk_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_partial,
p_values=None,
description=f"Top-k only (partial batch, 50% k=50, 50% k=vocab), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_partial, None, vocab_size, batch_size),
)
)
# 3. Top-p only - whole batch (all rows have p < 1.0)
p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_all,
description=f"Top-p only (whole batch, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_all, vocab_size, batch_size),
)
)
# 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
p_partial = torch.full(
(batch_size,), 0.9, dtype=torch.float32, device=device
)
p_partial[batch_size // 2 :] = 1.0 # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topp_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_partial,
description=f"Top-p only (partial batch, 50% p=0.9, 50% p=1.0), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_partial, vocab_size, batch_size),
)
)
# 5. Mix of top-k and top-p (both applied to whole batch)
k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mix,
p_values=p_mix,
description=f"Top-k + Top-p (whole batch, k=100, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mix, p_mix, vocab_size, batch_size),
)
)
# 6. Mix with partial application (some rows k only, some p only, some both)
k_mixed = torch.full(
(batch_size,), vocab_size, dtype=torch.int32, device=device
)
p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
# First third: k only
third = batch_size // 3
k_mixed[:third] = 50
# Second third: p only
p_mixed[third : 2 * third] = 0.5
# Last third: both k and p
k_mixed[2 * third :] = 100
p_mixed[2 * third :] = 0.9
configs.append(
BenchmarkConfig(
name=f"mixed_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mixed,
p_values=p_mixed,
description=f"Mixed partial (1/3 k=50, 1/3 p=0.9, 1/3 both), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mixed, p_mixed, vocab_size, batch_size),
)
)
return configs
def format_memory(bytes_val: int) -> str:
"""Format memory in human-readable form."""
if bytes_val >= 1024**3:
return f"{bytes_val / (1024**3):.2f} GB"
elif bytes_val >= 1024**2:
return f"{bytes_val / (1024**2):.2f} MB"
elif bytes_val >= 1024:
return f"{bytes_val / 1024:.2f} KB"
return f"{bytes_val} B"
def run_benchmark(
configs: list[BenchmarkConfig],
warmup_iters: int = 5,
benchmark_iters: int = 20,
verbose: bool = True,
):
"""Run all benchmarks and print results."""
results = []
print("=" * 100)
print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
print("=" * 100)
print()
for config in configs:
if verbose:
print(f"Running: {config.description}")
# Create fresh logits for this config
logits = create_logits(config.batch_size, config.vocab_size)
# Benchmark Triton
reset_memory_stats()
triton_time, triton_mem = benchmark_function(
apply_top_k_top_p_triton,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
# Benchmark PyTorch
reset_memory_stats()
pytorch_time, pytorch_mem = benchmark_function(
apply_top_k_top_p_pytorch,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
speedup = pytorch_time / triton_time if triton_time > 0 else float("inf")
mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf")
result = {
"config": config,
"triton_time_ms": triton_time,
"pytorch_time_ms": pytorch_time,
"triton_mem": triton_mem,
"pytorch_mem": pytorch_mem,
"speedup": speedup,
"mem_ratio": mem_ratio,
}
results.append(result)
if verbose:
print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}")
print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
print()
# Clean up
del logits
reset_memory_stats()
return results
def print_summary_table(results: list[dict]):
"""Print a summary table of results."""
print()
print("=" * 130)
print("SUMMARY TABLE")
print("=" * 130)
print()
# Header
header = (
f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} "
f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} "
f"{'Tri Mem':>10} {'Pyt Mem':>10}"
)
print(header)
print("-" * 130)
# Group by scenario type
current_vocab = None
for result in results:
config = result["config"]
# Add separator between vocab sizes
if current_vocab != config.vocab_size:
if current_vocab is not None:
print("-" * 130)
current_vocab = config.vocab_size
scenario = config.name.split("_b")[0] # Extract scenario name
print(
f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} "
f"{config.ops_pct:>5.0f}% "
f"{result['triton_time_ms']:>12.3f} {result['pytorch_time_ms']:>13.3f} "
f"{result['speedup']:>7.2f}x "
f"{format_memory(result['triton_mem']):>10} "
f"{format_memory(result['pytorch_mem']):>10}"
)
print("=" * 130)
def main():
parser = argparse.ArgumentParser(
description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 4, 16, 64, 128, 512, 1024, 2048],
help="Batch sizes to test (default: 1 4 16 64)",
)
parser.add_argument(
"--vocab-sizes",
type=int,
nargs="+",
default=[32768, 131072], # 32k, 128k
help="Vocabulary sizes to test (default: 32768 131072)",
)
parser.add_argument(
"--warmup-iters",
type=int,
default=5,
help="Number of warmup iterations (default: 5)",
)
parser.add_argument(
"--benchmark-iters",
type=int,
default=20,
help="Number of benchmark iterations (default: 20)",
)
parser.add_argument(
"--quiet",
action="store_true",
help="Only print summary table",
)
args = parser.parse_args()
# Print configuration
print(f"Batch sizes: {args.batch_sizes}")
print(f"Vocab sizes: {args.vocab_sizes}")
print(f"Warmup iterations: {args.warmup_iters}")
print(f"Benchmark iterations: {args.benchmark_iters}")
print()
# Check CUDA
if not torch.cuda.is_available():
print("ERROR: CUDA is not available. This benchmark requires a GPU.")
return
device_name = torch.cuda.get_device_name(0)
print(f"GPU: {device_name}")
print()
# Create configs
configs = create_benchmark_configs(
args.batch_sizes,
args.vocab_sizes,
)
# Run benchmarks
results = run_benchmark(
configs,
warmup_iters=args.warmup_iters,
benchmark_iters=args.benchmark_iters,
verbose=not args.quiet,
)
# Print summary
print_summary_table(results)
if __name__ == "__main__":
main()

View File

@@ -13,7 +13,6 @@ from torch.utils.benchmark import Measurement as TMeasurement
from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
@@ -292,7 +291,6 @@ def print_timers(timers: Iterable[TMeasurement]):
compare.print()
@default_vllm_config()
def main():
torch.set_default_device("cuda")
bench_params = get_bench_params()

View File

@@ -8,7 +8,6 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp,
)
@@ -41,7 +40,6 @@ DEEPSEEK_V3_SHAPES = [
]
@default_vllm_config()
def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2

View File

@@ -7,7 +7,6 @@ from unittest.mock import patch
import pandas as pd
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
@@ -85,7 +84,6 @@ def calculate_diff(
configs = []
@default_vllm_config()
def benchmark_quantization(
batch_size,
hidden_size,

View File

@@ -7,7 +7,6 @@ import itertools
import torch
import vllm.model_executor.layers.activation # noqa F401
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.custom_op import op_registry
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -19,7 +18,6 @@ intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
@default_vllm_config()
def benchmark_activation(
batch_size: int,
seq_len: int,

View File

@@ -11,7 +11,6 @@ import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
@@ -162,7 +161,7 @@ def bench_run(
w2_fp8q_cutlass,
topk_weights,
topk_ids,
activation=MoEActivation.SILU,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()

View File

@@ -30,9 +30,6 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.flashinfer_all_reduce import (
FlashInferAllReduce,
)
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
@@ -47,7 +44,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)
# Default sequence lengths to benchmark
DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
# Fixed hidden size and dtype for all benchmarks
HIDDEN_SIZE = 8192
@@ -84,7 +81,6 @@ class CommunicatorBenchmark:
self.symm_mem_comm = None
self.symm_mem_comm_multimem = None
self.symm_mem_comm_two_shot = None
self.fi_ar_comm = None
self._init_communicators()
@@ -165,22 +161,6 @@ class CommunicatorBenchmark:
)
self.symm_mem_comm_two_shot = None
try:
self.fi_ar_comm = FlashInferAllReduce(
group=self.cpu_group,
device=self.device,
)
if not self.fi_ar_comm.disabled:
logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
else:
logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
self.fi_ar_comm = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
)
self.fi_ar_comm = None
def benchmark_allreduce(
self, sequence_length: int, num_warmup: int, num_trials: int
) -> dict[str, float]:
@@ -200,8 +180,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
None, # no destroy function
"1stage", # env variable value
)
)
# CustomAllreduce two-shot
@@ -211,8 +190,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
None, # no destroy function
"2stage", # env variable value
)
)
@@ -224,8 +202,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t: True, # Always available if initialized
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
communicators.append(
@@ -234,8 +211,7 @@ class CommunicatorBenchmark:
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
@@ -247,8 +223,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
@@ -260,67 +235,29 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
{}, # no env variable needed
None, # no destroy function needed
)
)
if self.fi_ar_comm is not None:
comm = self.fi_ar_comm
communicators.append(
(
"flashinfer_trtllm",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
lambda c=comm: c.destroy(),
)
)
communicators.append(
(
"flashinfer_mnnvl",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
lambda c=comm: c.destroy(),
None, # no env variable needed
)
)
# Benchmark each communicator
for (
name,
allreduce_fn,
should_use_fn,
context,
env_dict,
destroy_fn,
) in communicators:
# Save original values and apply new environment variables
saved_env = {key: os.environ.get(key) for key in env_dict}
for key, value in env_dict.items():
os.environ[key] = value
try:
latency = self.benchmark_allreduce_single(
sequence_length,
allreduce_fn,
should_use_fn,
context,
num_warmup,
num_trials,
)
if latency is not None:
results[name] = latency
finally:
if destroy_fn is not None:
destroy_fn()
# Restore environment variables to their original state
for key, original_value in saved_env.items():
if original_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = original_value
for name, allreduce_fn, should_use_fn, context, env_var in communicators:
# Set environment variable if needed
if env_var is not None:
os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
else:
# Clear the environment variable to avoid interference
os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
latency = self.benchmark_allreduce_single(
sequence_length,
allreduce_fn,
should_use_fn,
context,
num_warmup,
num_trials,
)
if latency is not None:
results[name] = latency
return results
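# The removed loop above saves the original environment variables, applies the
# per-communicator overrides, and restores everything afterward. A compact
# equivalent of that idiom (illustrative only, not the benchmark's own helper):
import os
from contextlib import contextmanager

@contextmanager
def temp_env(overrides: dict[str, str]):
    saved = {k: os.environ.get(k) for k in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for k, old in saved.items():
            if old is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = old

# Usage: with temp_env({"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"}): run_one_case()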

View File

@@ -5,11 +5,8 @@
Benchmark for FlashInfer fused collective operations vs standard operations.
This benchmark compares:
1. FlashInfer's allreduce_fusion with trtllm backend
(fused allreduce + rmsnorm + optional FP8/FP4 quant)
2. FlashInfer's allreduce_fusion with mnnvl backend
(fused allreduce + rmsnorm only, no quantization support)
3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
Usage with torchrun:
torchrun --nproc_per_node=2 benchmark_fused_collective.py
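# For reference, the "standard" baseline named in the docstring is the unfused
# two-step path: an all-reduce followed by a separate RMSNorm kernel. A minimal
# sketch (illustrative only; assumes torch.distributed is already initialized):
import torch
import torch.distributed as dist

def unfused_allreduce_rmsnorm(x: torch.Tensor, residual: torch.Tensor,
                              weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Step 1: all-reduce the partial results across tensor-parallel ranks.
    dist.all_reduce(x)
    # Step 2: add the residual, then apply RMSNorm as a separate operation.
    x = x + residual
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight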
@@ -27,6 +24,7 @@ import torch.distributed as dist # type: ignore
from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import (
get_tp_group,
tensor_model_parallel_all_reduce,
)
from vllm.distributed.parallel_state import (
@@ -51,19 +49,14 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
logger = init_logger(__name__)
# Try to import FlashInfer
TorchDistBackend = None
try:
import flashinfer.comm as flashinfer_comm # type: ignore
from flashinfer.comm.mnnvl import ( # type: ignore
TorchDistBackend,
)
if not (
hasattr(flashinfer_comm, "allreduce_fusion")
and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace")
):
if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
flashinfer_comm = None
logger.warning("FlashInfer comm module found but missing allreduce_fusion API")
logger.warning(
"FlashInfer comm module found but missing trtllm_allreduce_fusion"
)
except ImportError:
flashinfer_comm = None
logger.warning("FlashInfer not found, only benchmarking standard operations")
@@ -81,70 +74,57 @@ _FI_MAX_SIZES = {
8: 64 * MiB, # 64MB
}
# Global workspace tensors for FlashInfer (keyed by backend name)
_FI_WORKSPACES: dict = {}
# Backends to benchmark
FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
# Global workspace tensor for FlashInfer
_FI_WORKSPACE_TENSOR = None
def setup_flashinfer_workspace(
backend: str,
world_size: int,
rank: int,
hidden_dim: int,
max_token_num: int,
dtype: torch.dtype,
use_fp32_lamport: bool = False,
):
"""Setup FlashInfer workspace for fused allreduce operations."""
    global _FI_WORKSPACES
global _FI_WORKSPACE_TENSOR
if flashinfer_comm is None:
return None
return None, None
if world_size not in _FI_MAX_SIZES:
logger.warning("FlashInfer not supported for world size %s", world_size)
return None
return None, None
try:
kwargs = {}
if TorchDistBackend is not None:
kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
workspace = flashinfer_comm.create_allreduce_fusion_workspace(
backend=backend,
world_size=world_size,
rank=rank,
max_token_num=max_token_num,
hidden_dim=hidden_dim,
dtype=dtype,
**kwargs,
# Create IPC workspace
ipc_handles, workspace_tensor = (
flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
tp_rank=rank,
tp_size=world_size,
max_token_num=max_token_num,
hidden_dim=hidden_dim,
group=get_tp_group().device_group,
use_fp32_lamport=use_fp32_lamport,
)
)
_FI_WORKSPACES[backend] = workspace
return workspace
_FI_WORKSPACE_TENSOR = workspace_tensor
return ipc_handles, workspace_tensor
except Exception as e:
logger.error(
"Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
)
return None
logger.error("Failed to setup FlashInfer workspace: %s", e)
return None, None
def cleanup_flashinfer_workspaces():
"""Cleanup all FlashInfer workspaces."""
if flashinfer_comm is None:
def cleanup_flashinfer_workspace(ipc_handles):
"""Cleanup FlashInfer workspace."""
if flashinfer_comm is None or ipc_handles is None:
return
for backend, workspace in _FI_WORKSPACES.items():
try:
workspace.destroy()
except Exception as e:
logger.error(
"Failed to cleanup FlashInfer workspace (backend=%s): %s",
backend,
e,
)
_FI_WORKSPACES.clear()
try:
group = get_tp_group().device_group
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
except Exception as e:
logger.error("Failed to cleanup FlashInfer workspace: %s", e)
class FlashInferFusedAllReduceParams:
@@ -152,15 +132,25 @@ class FlashInferFusedAllReduceParams:
def __init__(
self,
rank: int,
world_size: int,
use_fp32_lamport: bool = False,
max_token_num: int = 1024,
):
self.rank = rank
self.world_size = world_size
self.use_fp32_lamport = use_fp32_lamport
self.trigger_completion_at_end = True
self.launch_with_pdl = True
self.fp32_acc = True
self.max_token_num = max_token_num
def get_flashinfer_fused_allreduce_kwargs(self):
def get_trtllm_fused_allreduce_kwargs(self):
return {
"world_rank": self.rank,
"world_size": self.world_size,
"launch_with_pdl": self.launch_with_pdl,
"trigger_completion_at_end": self.trigger_completion_at_end,
"fp32_acc": self.fp32_acc,
}
@@ -171,12 +161,11 @@ def flashinfer_fused_allreduce_rmsnorm(
rms_gamma: torch.Tensor,
rms_eps: float,
allreduce_params: "FlashInferFusedAllReduceParams",
workspace: object,
use_oneshot: bool,
norm_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm operation."""
if flashinfer_comm is None or workspace is None:
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -185,25 +174,24 @@ def flashinfer_fused_allreduce_rmsnorm(
else:
residual_out = input_tensor
layout_code = None
if workspace.backend == "trtllm":
layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
allreduce_out=None,
quant_out=None,
scale_out=None,
layout_code=layout_code,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=None,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -214,16 +202,12 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
rms_eps: float,
scale_factor: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
use_oneshot: bool = True,
norm_out: torch.Tensor | None = None,
quant_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization.
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -232,21 +216,24 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
else:
residual_out = input_tensor
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
allreduce_out=None,
quant_out=quant_out,
scale_out=None,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=scale_factor,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -257,17 +244,13 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
rms_eps: float,
input_global_scale: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
quant_out: torch.Tensor,
use_oneshot: bool,
output_scale: torch.Tensor,
norm_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization.
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -276,21 +259,24 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
else:
residual_out = input_tensor
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
allreduce_out=None,
quant_out=quant_out,
scale_out=output_scale,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=input_global_scale,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -423,16 +409,13 @@ def run_benchmarks(
dtype: torch.dtype,
use_residual: bool,
allreduce_params: FlashInferFusedAllReduceParams | None,
workspaces: dict,
quant_modes: set[str],
no_oneshot: bool,
):
"""Run all benchmarks for given configuration.
Args:
allreduce_params: Shared parameters for FlashInfer fused allreduce.
workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
quant_modes: Set of quantization modes: "none", "fp8", "fp4".
quant_mode: "none", "fp8_only", "fp4_only", or "all"
"""
(
input_tensor,
@@ -448,18 +431,18 @@ def run_benchmarks(
rms_eps = 1e-6
results = {}
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
use_oneshot_options = [False] if no_oneshot else [True, False]
# Create RMSNorm and QuantFP8 layers once for native benchmarks
if "none" in quant_modes:
# Standard AllReduce + RMSNorm
# Re-create VllmFusedAllreduce per config so CustomOp binds the
# correct forward method (native vs custom kernel).
for custom_op in ["-rms_norm", "+rms_norm"]:
with set_current_vllm_config(
VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
suffix = (
"_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
)
@@ -478,7 +461,6 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm,
fullgraph=True,
@@ -494,11 +476,10 @@ def run_benchmarks(
logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
# FlashInfer Fused AllReduce + RMSNorm (all backends)
for backend, workspace in workspaces.items():
# FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm,
@@ -508,17 +489,14 @@ def run_benchmarks(
rms_gamma=rms_gamma,
rms_eps=rms_eps,
allreduce_params=allreduce_params,
workspace=workspace,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
except Exception as e:
logger.error(
"FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
backend,
e,
logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
"inf"
)
results[key] = float("inf")
if "fp8" in quant_modes:
# Standard AllReduce + RMSNorm + FP8 Quant
@@ -527,7 +505,7 @@ def run_benchmarks(
"_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
)
for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
op_suffix = suffix + (
suffix += (
"_custom_quant_fp8"
if "+" in quant_fp8_custom_op
else "_native_quant_fp8"
@@ -540,17 +518,16 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
input_tensor,
residual=residual,
scale_factor=scale_fp8,
)
results[f"standard_allreduce{op_suffix}"] = time_ms
results[f"standard_allreduce{suffix}"] = time_ms
except Exception as e:
logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
results[f"standard_allreduce{op_suffix}"] = float("inf")
results[f"standard_allreduce{suffix}"] = float("inf")
# Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
with set_current_vllm_config(
@@ -561,7 +538,6 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
fullgraph=True,
@@ -584,12 +560,10 @@ def run_benchmarks(
"inf"
)
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp8_quant,
@@ -601,16 +575,19 @@ def run_benchmarks(
scale_factor=scale_fp8,
quant_out=quant_out_fp8,
allreduce_params=allreduce_params,
workspace=trtllm_ws,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
"FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
e,
)
results[key] = float("inf")
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
float("inf")
)
if "fp4" in quant_modes and current_platform.has_device_capability(100):
# Standard AllReduce + RMSNorm + FP4 Quant
@@ -626,7 +603,6 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
input_tensor,
@@ -645,7 +621,6 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
fullgraph=True,
@@ -670,12 +645,10 @@ def run_benchmarks(
"inf"
)
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
@@ -686,18 +659,49 @@ def run_benchmarks(
rms_eps=rms_eps,
input_global_scale=scale_fp4,
allreduce_params=allreduce_params,
workspace=trtllm_ws,
quant_out=fp4_quant_out,
output_scale=fp4_output_scale,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
"FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
e,
)
results[key] = float("inf")
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
float("inf")
)
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
if flashinfer_comm is not None and allreduce_params is not None:
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
input_tensor,
residual=residual,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
input_global_scale=scale_fp4,
allreduce_params=allreduce_params,
quant_out=fp4_quant_out,
output_scale=fp4_output_scale,
use_oneshot=False,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
e,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
"inf"
)
return results
@@ -1035,33 +1039,24 @@ def main():
configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
# Setup FlashInfer workspaces for all backends
# Setup FlashInfer workspace if available
ipc_handles = None
allreduce_params = None
if flashinfer_comm is not None:
# Use the largest hidden dimension for workspace setup
max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)
workspace_dtype = (
torch.float32
if max_element_size == 4
else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16)
)
max_num_token = _FI_MAX_SIZES.get(world_size) // (
args.hidden_dim * max_element_size
args.hidden_dim * world_size * 2
)
for backend in FLASHINFER_BACKENDS:
setup_flashinfer_workspace(
backend=backend,
world_size=world_size,
rank=rank,
hidden_dim=args.hidden_dim,
max_token_num=max_num_token,
dtype=workspace_dtype,
)
ipc_handles, workspace_tensor = setup_flashinfer_workspace(
world_size, rank, args.hidden_dim, max_num_token
)
if _FI_WORKSPACES:
if workspace_tensor is not None:
allreduce_params = FlashInferFusedAllReduceParams(
rank=rank,
world_size=world_size,
max_token_num=max_num_token,
)
@@ -1086,7 +1081,6 @@ def main():
dtype,
use_residual,
allreduce_params,
workspaces=_FI_WORKSPACES,
quant_modes=quant_modes,
no_oneshot=args.no_oneshot,
)
@@ -1125,13 +1119,11 @@ def main():
finally:
# Cleanup
cleanup_flashinfer_workspaces()
if ipc_handles is not None:
cleanup_flashinfer_workspace(ipc_handles)
dist.barrier()
if __name__ == "__main__":
from vllm.config import VllmConfig, set_current_vllm_config
with set_current_vllm_config(VllmConfig()):
main()
main()

View File

@@ -5,14 +5,12 @@ import time
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode()
@default_vllm_config()
def main(
num_tokens: int,
hidden_size: int,

View File

@@ -16,7 +16,6 @@ import torch
from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
@@ -100,38 +99,13 @@ def benchmark_config(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
num_iters: int = 100,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int4_w4a16:
# Int4 packed weights: 2 int4 values per uint8 byte
# K dimension is packed (halved)
intermediate_size = shard_intermediate_size // 2 # after silu_and_mul
w1 = torch.randint(
0,
255,
(
num_experts,
shard_intermediate_size,
hidden_size // 2, # int4 packing
),
dtype=torch.uint8,
)
w2 = torch.randint(
0,
255,
(
num_experts,
hidden_size,
intermediate_size // 2, # int4 packing
),
dtype=torch.uint8,
)
elif use_int8_w8a16:
if use_int8_w8a16:
w1 = torch.randint(
-127,
127,
@@ -165,20 +139,7 @@ def benchmark_config(
w2_scale = None
a1_scale = None
a2_scale = None
if use_int4_w4a16:
if block_quant_shape is None:
raise ValueError("block_quant_shape is required for int4_w4a16")
group_size = block_quant_shape[1]
# Scales shape: (E, N, K // group_size) in fp16
w1_scale = torch.rand(
(num_experts, shard_intermediate_size, hidden_size // group_size),
dtype=dtype,
)
w2_scale = torch.rand(
(num_experts, hidden_size, intermediate_size // group_size),
dtype=dtype,
)
elif use_int8_w8a16:
if use_int8_w8a16:
w1_scale = torch.randn(
(num_experts, 2 * shard_intermediate_size), dtype=torch.float32
)
@@ -237,7 +198,6 @@ def benchmark_config(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
weight_dtype="int4" if use_int4_w4a16 else None,
)
deep_gemm_experts = None
@@ -251,8 +211,7 @@ def benchmark_config(
hidden_dim=hidden_size,
intermediate_size_per_partition=shard_intermediate_size,
num_local_experts=num_experts,
num_logical_experts=num_experts,
activation=MoEActivation.SILU,
activation="silu",
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=init_dtype,
routing_method=RoutingMethodType.TopK,
@@ -267,10 +226,9 @@ def benchmark_config(
x, input_gating, topk, renormalize=not use_deep_gemm
)
inplace = not disable_inplace()
if use_deep_gemm:
return deep_gemm_experts(
x, w1, w2, topk_weights, topk_ids, inplace=inplace
x, w1, w2, topk_weights, topk_ids, inplace=True
)
return fused_experts(
x,
@@ -278,7 +236,7 @@ def benchmark_config(
w2,
topk_weights,
topk_ids,
inplace=inplace,
inplace=True,
quant_config=quant_config,
)
@@ -520,7 +478,6 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
@@ -528,10 +485,7 @@ class BenchmarkWorker:
set_random_seed(self.seed)
dtype_str = _get_config_dtype_str(
dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
@@ -562,7 +516,6 @@ class BenchmarkWorker:
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
num_iters=100,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm,
@@ -579,7 +532,6 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
use_deep_gemm: bool,
@@ -590,7 +542,7 @@ class BenchmarkWorker:
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = prune_rocm_search_space(
num_tokens,
shard_intermediate_size,
@@ -619,7 +571,6 @@ class BenchmarkWorker:
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
num_iters=20,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm,
@@ -667,7 +618,6 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
else {}
),
**({"kpack": config["kpack"]} if "kpack" in config else {}),
**({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
}
@@ -680,15 +630,11 @@ def save_configs(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
block_quant_shape: list[int],
save_dir: str,
) -> None:
dtype_str = _get_config_dtype_str(
dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -790,38 +736,6 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size
def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.
This reads directly from the HuggingFace config object (as returned by
``get_config()``), not from vLLM's quantization config classes.
Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
compressed-tensors configs (nested inside 'config_groups').
"""
quantization_config = getattr(config, "quantization_config", {})
if not isinstance(quantization_config, dict):
return None
# AWQ / GPTQ style: group_size is a top-level key
gs = quantization_config.get("group_size")
if gs is not None:
return gs
# compressed-tensors style: group_size is nested in config_groups
config_groups = quantization_config.get("config_groups", {})
if not isinstance(config_groups, dict):
return None
for group_cfg in config_groups.values():
if not isinstance(group_cfg, dict):
continue
weights = group_cfg.get("weights", {})
if not isinstance(weights, dict):
continue
gs = weights.get("group_size")
if gs is not None:
return gs
return None
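# For reference, the two quantization_config shapes the helper above parses look
# roughly like this (field values are illustrative):
awq_style = {"quant_method": "awq", "bits": 4, "group_size": 128}
compressed_tensors_style = {
    "quant_method": "compressed-tensors",
    "config_groups": {
        "group_0": {"weights": {"num_bits": 4, "group_size": 128}},
    },
}
# Either shape, attached to the HF config as config.quantization_config, would make
# get_quantization_group_size(config) return 128.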
def main(args: argparse.Namespace):
print(args)
@@ -840,20 +754,7 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
block_quant_shape = get_weight_block_size_safety(config)
if use_int4_w4a16:
group_size = get_quantization_group_size(config)
if group_size is None:
raise ValueError(
"Could not determine group_size from model config. "
"The model's quantization_config must contain a 'group_size' "
"field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
"(compressed-tensors)."
)
# For int4_w4a16, block_shape = [0, group_size]
# block_shape[0]=0 means no block quantization on N dimension
block_quant_shape = [0, group_size]
if args.batch_size is None:
batch_sizes = [
@@ -907,20 +808,8 @@ def main(args: argparse.Namespace):
return ray.get(outputs)
if args.tune:
# int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
# search space generation (no matrix_instr_nonkdim/kpack exploration).
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
# For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
# apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
# of group_size. Skip block_quant_shape filtering to keep the full
# search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
if use_int4_w4a16:
# SPLIT_K is a required kernel constexpr for gptq_awq kernel;
# only SPLIT_K=1 is used at runtime, so fix it during tuning.
for cfg in search_space:
cfg["SPLIT_K"] = 1
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
if use_deep_gemm:
raise ValueError(
@@ -940,7 +829,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
search_space,
block_quant_shape,
use_deep_gemm,
@@ -960,7 +848,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
block_quant_shape,
args.save_dir,
)
@@ -979,7 +866,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
block_quant_shape,
use_deep_gemm,
)
@@ -1002,10 +888,7 @@ if __name__ == "__main__":
)
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
parser.add_argument(
"--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
default="auto",
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument(

View File

@@ -1,278 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing old vs new default fused MoE configs.
Runs the triton fused_moe kernel with three configurations for each scenario:
1. Tuned config (from JSON file, if available) — the target to match
2. Old default (the hardcoded defaults before this change)
3. New default (the improved defaults)
Usage:
python benchmarks/kernels/benchmark_moe_defaults.py
Produces a table showing kernel time (us) and speedup of new vs old defaults.
"""
import torch
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
get_default_config,
get_moe_configs,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.torch_utils import set_random_seed
FP8_DTYPE = current_platform.fp8_dtype()
def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
"""The original defaults before https://github.com/vllm-project/vllm/pull/34846,
for comparison."""
if dtype == "fp8_w8a8" and block_shape is not None:
return {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": block_shape[0],
"BLOCK_SIZE_K": block_shape[1],
"GROUP_SIZE_M": 32,
"SPLIT_K": 1,
"num_warps": 4,
"num_stages": 3 if not current_platform.is_rocm() else 2,
}
elif M <= E:
return {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"SPLIT_K": 1,
}
else:
return {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
}
def benchmark_config(
config,
M,
E,
N,
K,
topk,
dtype,
use_fp8=False,
block_shape=None,
num_iters=100,
):
"""Time a single kernel config. Returns kernel time in microseconds."""
init_dtype = torch.float16 if use_fp8 else dtype
a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if use_fp8:
if block_shape is not None:
bsn, bsk = block_shape
n_tiles_w1 = triton.cdiv(2 * N, bsn)
k_tiles_w1 = triton.cdiv(K, bsk)
n_tiles_w2 = triton.cdiv(K, bsn)
k_tiles_w2 = triton.cdiv(N, bsk)
w1_scale = torch.rand(
E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
)
w2_scale = torch.rand(
E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
)
else:
w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
# Only weights are stored in fp8; activations stay in bf16/fp16
# and get dynamically quantized inside the kernel.
w1 = w1.to(FP8_DTYPE)
w2 = w2.to(FP8_DTYPE)
quant_config = FusedMoEQuantConfig.make(
quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
)
gating = torch.randn(M, E, device="cuda", dtype=torch.float32)
# Warmup
for _ in range(20):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(num_iters):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
end.record()
torch.cuda.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
# N = moe_intermediate_size // tp_size (the value used in config file lookup)
MODELS = [
# --- Few experts ---
("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
# --- Many experts: real model shapes at tp=1 ---
# Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
# DeepSeek-V2: E=64, topk=6, N=1407, K=4096
# (use 1408 to avoid odd alignment; real model is 1407)
("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
# OLMoE-7B: E=64, topk=8, N=2048, K=2048
("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
# GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
# Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
# DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
# Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
# E=128 N=1856 bf16
("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
# E=256 N=512 bf16 (DS-V3 tp=4)
("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
# E=512 N=512 bf16 (Qwen3-Next tp=1)
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
# E=512 N=256 bf16 (Qwen3-Next tp=2)
("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
# --- FP8 block quant (many experts) ---
# DS-V3 tp=4: E=256, N=512, fp8 block
("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
# DS-V3 tp=8: E=256, N=256, fp8 block
("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
# Qwen3-Next tp=2 fp8 block
("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
]
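# Quick consistency check of the N = moe_intermediate_size // tp_size convention
# noted above the MODELS list, using the DeepSeek-V3 rows (illustrative only):
deepseek_v3_moe_intermediate_size = 2048
for tp_size, expected_n in [(1, 2048), (4, 512), (8, 256)]:
    assert deepseek_v3_moe_intermediate_size // tp_size == expected_n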
BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
def main():
set_random_seed(0)
torch.set_default_device("cuda")
dtype = torch.bfloat16
for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
print(f"\n{'=' * 90}")
print(f" {name} (E={E}, N={N}, K={K}, topk={topk})")
print(f"{'=' * 90}")
# Try to load tuned config
block_n = block_shape[0] if block_shape else None
block_k = block_shape[1] if block_shape else None
tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
has_tuned = tuned is not None
print(f" Tuned config available: {has_tuned}")
hdr = (
f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
)
print(f" {hdr}")
print(f" {'-' * len(hdr)}")
for M in BATCH_SIZES:
old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)
if has_tuned:
tuned_cfg = tuned[min(tuned.keys(), key=lambda x: abs(x - M))]
t_tuned = benchmark_config(
tuned_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
else:
t_tuned = None
t_old = benchmark_config(
old_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
t_new = benchmark_config(
new_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
ratio_new_old = t_new / t_old
tuned_str = f"{t_tuned:11.2f}" if t_tuned else f"{'N/A':>11}"
ratio_tuned = f"{t_new / t_tuned:10.2f}x" if t_tuned else f"{'N/A':>10}"
# flag regressions where new default is >5% slower than old
marker = " <--" if ratio_new_old > 1.05 else ""
print(
f" {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
)
if __name__ == "__main__":
main()

View File

@@ -36,7 +36,6 @@ from typing import Any
import numpy as np
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.transformers_utils.config import get_config
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -79,7 +78,6 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
}
@default_vllm_config()
def benchmark_mrope(
model_name: str,
num_tokens: int,

View File

@@ -5,7 +5,6 @@ import itertools
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -30,7 +29,6 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
args={},
)
)
@default_vllm_config()
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192

View File

@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage
;;
*)
printf "Unknown argument: %s\n" "$1"
echo "Unknown argument: $1\n"
usage
;;
esac
@@ -84,17 +84,15 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1)
# Common parameters
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
COMMON_PARAMS="--backend $BACKEND \
--model $MODEL \
--dataset $DATASET \
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
--save-results \
--result-dir $OUTPUT_DIR \
--output-len $MAX_NEW_TOKENS \
--port $PORT \
--tokenizer-mode $TOKENIZER_MODE"
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
@@ -111,17 +109,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
--result-filename "$FILENAME" \
--num-prompts "$NUM_PROMPTS"
--num-prompts $NUM_PROMPTS
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"

View File

@@ -18,7 +18,6 @@ set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
@@ -116,10 +115,6 @@ else()
set(AVX512_FOUND ON)
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
endif()
if (ENABLE_ARM_BF16)
set(ARM_BF16_FOUND ON)
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
endif()
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)

View File

@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""

View File

@@ -1,9 +1,9 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
FetchContent_Declare(
@@ -24,7 +24,7 @@ else()
)
endif()
# Fetch content
# Fetch content
FetchContent_MakeAvailable(triton_kernels)
if (NOT triton_kernels_SOURCE_DIR)
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
## Copy .py files to install directory.
install(DIRECTORY
${TRITON_KERNELS_PYTHON_DIR}
DESTINATION
DESTINATION
vllm/third_party/triton_kernels/
COMPONENT triton_kernels
FILES_MATCHING PATTERN "*.py")

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -9,113 +9,6 @@
namespace vllm {
struct alignas(32) u32x8_t {
uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
};
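// 256-bit global load/store helpers: on SM 10.0+ with CUDA 12.9+ they issue a single
// v8.u32 PTX instruction; otherwise they fall back to two 128-bit (uint4) accesses.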
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
: "l"(ptr));
#else
const uint4* uint_ptr = reinterpret_cast<const uint4*>(ptr);
uint4 top_half = __ldg(&uint_ptr[0]);
uint4 bottom_half = __ldg(&uint_ptr[1]);
val.u0 = top_half.x;
val.u1 = top_half.y;
val.u2 = top_half.z;
val.u3 = top_half.w;
val.u4 = bottom_half.x;
val.u5 = bottom_half.y;
val.u6 = bottom_half.z;
val.u7 = bottom_half.w;
#endif
}
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
:
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
"r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7)
: "memory");
#else
uint4* uint_ptr = reinterpret_cast<uint4*>(ptr);
uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3);
uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7);
#endif
}
template <bool support_256>
struct VecTraits;
template <>
struct VecTraits<true> {
static constexpr int ARCH_MAX_VEC_SIZE = 32;
using vec_t = u32x8_t;
};
template <>
struct VecTraits<false> {
static constexpr int ARCH_MAX_VEC_SIZE = 16;
using vec_t = int4;
};
template <typename T>
struct PackedTraits;
template <>
struct PackedTraits<c10::BFloat16> {
using packed_t = __nv_bfloat162;
};
template <>
struct PackedTraits<c10::Half> {
using packed_t = __half2;
};
template <>
struct PackedTraits<float> {
using packed_t = float2;
};
template <typename packed_t>
__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
return __bfloat1622float2(val);
} else if constexpr (std::is_same_v<packed_t, __half2>) {
return __half22float2(val);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return float2(val);
}
}
template <typename packed_t>
__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
return __float22bfloat162_rn(val);
} else if constexpr (std::is_same_v<packed_t, __half2>) {
return __float22half2_rn(val);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return float2(val);
}
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
const packed_t& y) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
std::is_same_v<packed_t, __half2>) {
return __hmul2(x, y);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return make_float2(x.x * y.x, x.y * y.y);
}
}
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
@@ -123,69 +16,52 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
}
template <typename packed_t, packed_t (*PACKED_ACT_FN)(const packed_t&),
bool act_first>
__device__ __forceinline__ packed_t packed_compute(const packed_t& x,
const packed_t& y) {
return act_first ? packed_mul(PACKED_ACT_FN(x), y)
: packed_mul(x, PACKED_ACT_FN(y));
}
// Check if all pointers are 16-byte aligned for int4 vectorized access
__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
}
// Check if all pointers are 32-byte aligned for 256-bit (u32x8_t) vectorized access
__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
}
// Activation and gating kernel template.
template <typename scalar_t, typename packed_t,
scalar_t (*ACT_FN)(const scalar_t&),
packed_t (*PACKED_ACT_FN)(const packed_t&), bool act_first,
bool use_vec, bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* x_ptr = input + token_idx * 2 * d;
const scalar_t* y_ptr = x_ptr + d;
scalar_t* out_ptr = out + blockIdx.x * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
// Check alignment for 128-bit vectorized access.
// All three pointers must be 16-byte aligned for safe int4 operations.
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
is_16byte_aligned(out_ptr);
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
const int num_vecs = d / 2 / VEC_SIZE;
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t x, y;
if constexpr (use_256b) {
ld256(x, &x_vec[i]);
ld256(y, &y_vec[i]);
} else {
x = VLLM_LDG(&x_vec[i]);
y = VLLM_LDG(&y_vec[i]);
}
auto* xp = reinterpret_cast<packed_t*>(&x);
auto* yp = reinterpret_cast<packed_t*>(&y);
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
auto* xp = reinterpret_cast<scalar_t*>(&x);
auto* yp = reinterpret_cast<scalar_t*>(&y);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
xp[j] =
packed_compute<packed_t, PACKED_ACT_FN, act_first>(xp[j], yp[j]);
}
if constexpr (use_256b) {
st256(x, &out_vec[i]);
} else {
out_vec[i] = x;
rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
VLLM_LDG(&y_ptr[i]));
}
} else {
// Scalar fallback for unaligned data or small d
@@ -203,15 +79,6 @@ __device__ __forceinline__ T silu_kernel(const T& x) {
return (T)(((float)x) / (1.0f + expf((float)-x)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_silu_kernel(const packed_t& val) {
// x * sigmoid(x)
float2 fval = cast_to_float2(val);
fval.x = fval.x / (1.0f + expf(-fval.x));
fval.y = fval.y / (1.0f + expf(-fval.y));
return cast_to_packed<packed_t>(fval);
}
template <typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
@@ -222,18 +89,6 @@ __device__ __forceinline__ T gelu_kernel(const T& x) {
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_gelu_kernel(const packed_t& val) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
constexpr float ALPHA = M_SQRT1_2;
float2 fval = cast_to_float2(val);
fval.x = fval.x * 0.5f * (1.0f + ::erf(fval.x * ALPHA));
fval.y = fval.y * 0.5f * (1.0f + ::erf(fval.y * ALPHA));
return cast_to_packed<packed_t>(fval);
}
template <typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
@@ -247,83 +102,32 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t
packed_gelu_tanh_kernel(const packed_t& val) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
float2 fval = cast_to_float2(val);
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = fval.x * fval.x * fval.x;
float inner = BETA * (fval.x + KAPPA * x_cube);
fval.x = 0.5f * fval.x * (1.0f + ::tanhf(inner));
x_cube = fval.y * fval.y * fval.y;
inner = BETA * (fval.y + KAPPA * x_cube);
fval.y = 0.5f * fval.y * (1.0f + ::tanhf(inner));
return cast_to_packed<packed_t>(fval);
}
} // namespace vllm
// Launch activation and gating kernel.
// Use ACT_FIRST (bool) indicating whether to apply the activation function
// first.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
auto dtype = input.scalar_type(); \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, true, true><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, true, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
}
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
if (num_tokens == 0) { \
return; \
} \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
}
void mul_and_silu(torch::Tensor& out, // [..., d]
@@ -331,22 +135,19 @@ void mul_and_silu(torch::Tensor& out, // [..., d]
{
// The difference between mul_and_silu and silu_and_mul is that mul_and_silu
// applies the silu to the latter half of the input.
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
false);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
}
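For reference, a minimal host-side sketch of the semantics these gating entry points implement, written against the PyTorch C++ API (ref_silu_and_mul and ref_mul_and_silu are illustrative names, not part of this diff): silu_and_mul applies SiLU to the first half of the last dimension and multiplies by the second half, while mul_and_silu applies SiLU to the second half instead.

#include <torch/torch.h>

// Illustrative reference only; input is assumed to have shape [..., 2 * d].
torch::Tensor ref_silu_and_mul(const torch::Tensor& input) {
  auto halves = input.chunk(2, /*dim=*/-1);   // halves[0] = x, halves[1] = y
  return torch::silu(halves[0]) * halves[1];  // silu(x) * y
}

torch::Tensor ref_mul_and_silu(const torch::Tensor& input) {
  auto halves = input.chunk(2, /*dim=*/-1);
  return halves[0] * torch::silu(halves[1]);  // x * silu(y)
}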
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel,
true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
}
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel,
vllm::packed_gelu_tanh_kernel, true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
}
namespace vllm {
@@ -357,57 +158,42 @@ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
return (T)(f > threshold ? f : 0.0f);
}
template <typename packed_t>
__device__ __forceinline__ packed_t
packed_fatrelu_kernel(const packed_t& val, const float threshold) {
float2 fval = cast_to_float2(val);
fval.x = fval.x > threshold ? fval.x : 0.0f;
fval.y = fval.y > threshold ? fval.y : 0.0f;
return cast_to_packed<packed_t>(fval);
}
template <typename scalar_t, typename packed_t,
scalar_t (*ACT_FN)(const scalar_t&, const float),
packed_t (*PACKED_ACT_FN)(const packed_t&, const float), bool use_vec,
bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
__global__ void act_and_mul_kernel_with_param(
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
const float param) {
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* x_ptr = input + token_idx * 2 * d;
const scalar_t* y_ptr = x_ptr + d;
scalar_t* out_ptr = out + blockIdx.x * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
// Check alignment for 128-bit vectorized access
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
is_16byte_aligned(out_ptr);
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
const int num_vecs = d / 2 / VEC_SIZE;
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t x, y;
if constexpr (use_256b) {
ld256(x, &x_vec[i]);
ld256(y, &y_vec[i]);
} else {
x = VLLM_LDG(&x_vec[i]);
y = VLLM_LDG(&y_vec[i]);
}
auto* xp = reinterpret_cast<packed_t*>(&x);
auto* yp = reinterpret_cast<packed_t*>(&y);
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
auto* xp = reinterpret_cast<scalar_t*>(&x);
auto* yp = reinterpret_cast<scalar_t*>(&y);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
xp[j] = packed_mul(PACKED_ACT_FN(xp[j], param), yp[j]);
}
if constexpr (use_256b) {
st256(x, &out_vec[i]);
} else {
out_vec[i] = x;
rp[j] = ACT_FN(xp[j], param) * yp[j];
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
}
} else {
// Scalar fallback for unaligned data or small d
@@ -490,58 +276,20 @@ __global__ void swigluoai_and_mul_kernel(
} // namespace vllm
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PACKED_KERNEL, PARAM) \
auto dtype = input.scalar_type(); \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES( \
dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL< \
typename vllm::PackedTraits<scalar_t>::packed_t>, \
true, true><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
PARAM); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES( \
dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL< \
typename vllm::PackedTraits<scalar_t>::packed_t>, \
true, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
PARAM); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, PARAM); \
}); \
}
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d, \
PARAM); \
});
#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \
int d = input.size(-1) / 2; \
@@ -561,8 +309,7 @@ __global__ void swigluoai_and_mul_kernel(
void fatrelu_and_mul(torch::Tensor& out, // [..., d],
torch::Tensor& input, // [..., 2 * d]
double threshold) {
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(
vllm::fatrelu_kernel, vllm::packed_fatrelu_kernel, threshold);
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
}
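As a hedged host-side sketch of what fatrelu_and_mul computes (ref_fatrelu_and_mul is an illustrative name, not part of this diff): elements in the first half of the last dimension pass through only when they exceed the threshold, and the result gates the second half.

// Illustrative reference only; input is assumed to have shape [..., 2 * d].
torch::Tensor ref_fatrelu_and_mul(const torch::Tensor& input, double threshold) {
  auto halves = input.chunk(2, /*dim=*/-1);
  auto gated = torch::where(halves[0] > threshold,
                            halves[0], torch::zeros_like(halves[0]));
  return gated * halves[1];  // fatrelu(x, threshold) * y
}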
void swigluoai_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., 2 * d]
@@ -572,41 +319,39 @@ void swigluoai_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {
// Element-wise activation kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), bool use_vec,
bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
const scalar_t* in_ptr = input + blockIdx.x * d;
scalar_t* out_ptr = out + blockIdx.x * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* in_ptr = input + token_idx * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(scalar_t);
const vec_t* in_vec = reinterpret_cast<const vec_t*>(in_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
// Check alignment for 128-bit vectorized access
const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t v;
if constexpr (use_256b) {
ld256(v, &in_vec[i]);
} else {
v = VLLM_LDG(&in_vec[i]);
}
int4 v = VLLM_LDG(&in_vec[i]), r;
auto* vp = reinterpret_cast<scalar_t*>(&v);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
vp[j] = ACT_FN(vp[j]);
}
if constexpr (use_256b) {
st256(v, &out_vec[i]);
} else {
out_vec[i] = v;
rp[j] = ACT_FN(vp[j]);
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
}
} else {
// Scalar fallback for unaligned data or small d
@@ -620,43 +365,18 @@ __global__ void activation_kernel(
} // namespace vllm
// Launch element-wise activation kernel.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
auto dtype = input.scalar_type(); \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, true> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, false> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, false> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
}
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / d; \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
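A hedged usage sketch for the simplified macro (gelu_example is a placeholder entry point, not one defined in this diff); the element-wise wrappers elsewhere in the file follow the same pattern as the gating wrappers above:

// Placeholder wrapper for illustration only; vllm::gelu_kernel is the exact-GELU
// functor defined earlier in this file.
void gelu_example(torch::Tensor& out,    // [..., d]
                  torch::Tensor& input)  // [..., d]
{
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_kernel);
}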
namespace vllm {


@@ -1234,13 +1234,8 @@ void cp_gather_and_upconvert_fp8_kv_cache(
"src_cache and seq_lens must be on the same device");
TORCH_CHECK(src_cache.device() == workspace_starts.device(),
"src_cache and workspace_starts must be on the same device");
auto dtype = src_cache.scalar_type();
TORCH_CHECK(
dtype == at::ScalarType::Byte || // uint8
dtype == at::ScalarType::Float8_e4m3fn || // fp8 e4m3
dtype == at::ScalarType::Float8_e5m2, // fp8 e5m2
"src_cache must be uint8, float8_e4m3fn, or float8_e5m2, but got ",
src_cache.dtype());
TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
@@ -1249,21 +1244,14 @@ void cp_gather_and_upconvert_fp8_kv_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
const uint8_t* src_ptr = nullptr;
if (dtype == at::ScalarType::Byte) {
src_ptr = src_cache.data_ptr<uint8_t>();
} else {
// float8_e4m3fn or float8_e5m2
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
}
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
src_cache.data_ptr<uint8_t>(),
reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table_stride, cache_block_stride, cache_entry_stride,
@@ -1305,8 +1293,7 @@ void indexer_k_quant_and_cache(
const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
static const std::string kv_cache_dtype = "fp8_e4m3";
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), kv_cache_dtype,
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}


@@ -16,8 +16,6 @@ torch::Tensor get_scheduler_metadata(
isa = cpu_attention::ISA::VEC16;
} else if (isa_hint == "neon") {
isa = cpu_attention::ISA::NEON;
} else if (isa_hint == "vxe") {
isa = cpu_attention::ISA::VXE;
} else {
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
}
@@ -102,8 +100,6 @@ void cpu_attn_reshape_and_cache(
return cpu_attention::ISA::VEC16;
} else if (isa == "neon") {
return cpu_attention::ISA::NEON;
} else if (isa == "vxe") {
return cpu_attention::ISA::VXE;
} else {
TORCH_CHECK(false, "Invalid ISA type: " + isa);
}
