Compare commits

..

7 Commits

Author SHA1 Message Date
Seiji Eicher
c44d0c6d66 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 5045d5c983)
2026-02-11 02:33:40 -08:00
Kunshang Ji
83db96d8cd [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit cb9574eb85)
2026-02-11 02:33:27 -08:00
zofia
dbfb79fe45 [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit b482f71e9f)
2026-02-11 02:33:15 -08:00
Roger Wang
b2e1fc3589 [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
(cherry picked from commit 8a5e0e2b2b)
2026-02-11 02:33:04 -08:00
Gregory Shtrasberg
55a1baebc5 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit c60f8e3b49)
2026-02-11 02:32:52 -08:00
Charlie Fu
e1e9841631 [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
(cherry picked from commit bb9f97308d)
2026-02-11 02:32:41 -08:00
zofia
5bd63387c3 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit 9bdb06b436)
2026-02-11 02:32:27 -08:00
1871 changed files with 44766 additions and 169887 deletions

View File

@@ -1,7 +1,6 @@
-group: Hardware - AMD Build
+group: Hardware
 steps:
 - label: "AMD: :docker: build image"
-key: image-build-amd
 depends_on: []
 device: amd_cpu
 no_plugin: true
@@ -10,7 +9,7 @@ steps:
 docker build
 --build-arg max_jobs=16
 --build-arg REMOTE_VLLM=1
---build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
+--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
 --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
 --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
 -f docker/Dockerfile.rocm

View File

@@ -21,20 +21,6 @@ steps:
 pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
 pytest -x -v -s tests/kernels/test_onednn.py"
-- label: CPU-Compatibility Tests
-depends_on: []
-soft_fail: true
-device: intel_cpu
-no_plugin: true
-source_file_dependencies:
-- cmake/cpu_extension.cmake
-- setup.py
-- vllm/platforms/cpu.py
-commands:
-- |
-bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
 - label: CPU-Language Generation and Pooling Model Tests
 depends_on: []
 soft_fail: true

View File

@@ -8,7 +8,7 @@ clean_docker_tag() {
 }
 print_usage_and_exit() {
-echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
+echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
 exit 1
 }
@@ -142,16 +142,11 @@ resolve_parent_commit() {
 print_bake_config() {
 echo "--- :page_facing_up: Resolved bake configuration"
-# Write to a temp directory to avoid polluting the repo root (which is the
-# Docker build context). Files left in the repo root get COPY'd into the
-# image and can cause duplicate artifact uploads from downstream steps.
-local bake_tmp
-bake_tmp="$(mktemp -d)"
-BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
 docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
 echo "Saved bake config to ${BAKE_CONFIG_FILE}"
 echo "--- :arrow_down: Uploading bake config to Buildkite"
-(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
+buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
 }
 #################################
@@ -159,7 +154,7 @@ print_bake_config() {
 #################################
 print_instance_info
-if [[ $# -lt 5 ]]; then
+if [[ $# -lt 7 ]]; then
 print_usage_and_exit
 fi
@@ -168,8 +163,10 @@ REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-IMAGE_TAG=$5
-IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+IMAGE_TAG=$7
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
 # build config
 TARGET="test-ci"
@@ -196,6 +193,8 @@ export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
 export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
 # print args
 echo "--- :mag: Arguments"
@@ -203,6 +202,8 @@ echo "REGISTRY: ${REGISTRY}"
 echo "REPO: ${REPO}"
 echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
 echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
 echo "IMAGE_TAG: ${IMAGE_TAG}"
 echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

View File

@@ -5,7 +5,8 @@ steps:
 depends_on: []
 timeout_in_minutes: 600
 commands:
-- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
 retry:
 automatic:
 - exit_status: -1 # Agent was lost

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -24,11 +24,13 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---build-arg VLLM_CPU_X86=true \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--build-arg VLLM_CPU_AVX512BF16=true \
+--build-arg VLLM_CPU_AVX512VNNI=true \
+--build-arg VLLM_CPU_AMXBF16=true \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
 --target vllm-test \
 --progress plain .
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
 --target vllm-test \
 --progress plain .
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
 --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
 --progress plain \
 https://github.com/vllm-project/vllm-gaudi.git
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
 --tasks chartqa \
 --batch_size auto \
 --apply_chat_template \
---limit "$LIMIT"
+--limit $LIMIT

View File

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``
@@ -20,11 +20,14 @@ usage() {
 echo
 }
-while getopts "m:l:f:t:" OPT; do
+while getopts "m:b:l:f:t:" OPT; do
 case ${OPT} in
 m )
 MODEL="$OPTARG"
 ;;
+b )
+BATCH_SIZE="$OPTARG"
+;;
 l )
 LIMIT="$OPTARG"
 ;;

View File

@@ -13,10 +13,9 @@ import os
 from contextlib import contextmanager
 import lm_eval
+import numpy as np
 import yaml
-from vllm.platforms import current_platform
 DEFAULT_RTOL = 0.08
@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
 "allow_deprecated_quantization=True,"
 )
-if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
-model_args += "attention_backend=TRITON_ATTN"
 env_vars = eval_config.get("env_vars", None)
 with scoped_env_vars(env_vars):
 results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 f"ground_truth={ground_truth:.3f} | "
 f"measured={measured_value:.3f} | rtol={rtol}"
 )
-min_acceptable = ground_truth * (1 - rtol)
-success = success and measured_value >= min_acceptable
+success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 assert success
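
The hunk above switches test_lm_eval_correctness_param between two acceptance criteria for the measured score: NumPy's symmetric np.isclose relative-tolerance check and a one-sided lower bound derived from ground_truth * (1 - rtol). A minimal sketch of the behavioral difference, assuming plain float scores (the constants are illustrative, not taken from the test configs):

import numpy as np

ground_truth, rtol = 0.78, 0.05

def two_sided(measured: float) -> bool:
    # symmetric relative tolerance around the reference value
    return bool(np.isclose(ground_truth, measured, rtol=rtol))

def lower_bound_only(measured: float) -> bool:
    # fail only when the score drops below ground_truth * (1 - rtol)
    return measured >= ground_truth * (1 - rtol)

for measured in (0.75, 0.70, 0.90):
    print(measured, two_sided(measured), lower_bound_only(measured))
# 0.75 passes both, 0.70 fails both, 0.90 fails np.isclose but passes the
# one-sided bound: the lower-bound form does not penalize scores that beat
# the reference.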

View File

@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
 "server_parameters": {
 "model": "meta-llama/Meta-Llama-3-8B",
 "tensor_parallel_size": 1,
+"swap_space": 16,
 "disable_log_stats": "",
 "load_format": "dummy"
 },

View File

@@ -7,10 +7,8 @@ import argparse
import html as _html import html as _html
import json import json
import os import os
from contextlib import nullcontext
from dataclasses import dataclass from dataclasses import dataclass
from importlib import util from importlib import util
from pathlib import Path
import pandas as pd import pandas as pd
@@ -33,45 +31,6 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}") pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# ----------------------------- # -----------------------------
# Core data compare # Core data compare
# ----------------------------- # -----------------------------
@@ -91,25 +50,19 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can - Concat along axis=1 (indexes align), then reset_index so callers can
group by columns. group by columns.
- If --debug, add a <file_label>_name column per file. - If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
""" """
print("\ncompare_data_column:", data_column) print("\ncompare_data_column:", data_column)
frames = [] frames = []
raw_data_cols: list[str] = [] raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = [] cols_per_file: list[set] = []
for f in files: for f in files:
try: try:
df_tmp = pd.read_json(f, orient="records") df_tmp = pd.read_json(f, orient="records")
except Exception as err: except Exception as err:
raise ValueError(f"Failed to read {f}") from err raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns)) cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +73,12 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files." "No common key columns found from info_cols across the input files."
) )
union_index = None meta_added = False
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
for file in files: for file in files:
df = pd.read_json(file, orient="records") df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have if drop_column in df.columns:
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True) df = df.dropna(subset=[drop_column], ignore_index=True)
for c in ( for c in (
@@ -163,61 +103,35 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first() meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if data_column in df_idx.columns: if not s.index.is_unique:
s = df_idx[data_column] s = s.groupby(level=key_cols, dropna=False).mean()
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s.name = file_label s.name = file_label
name_s = None if not meta_added:
frames.append(meta)
meta_added = True
if debug and name_column in df_idx.columns: if debug and name_column in df_idx.columns:
name_s = df_idx[name_column] name_s = df_idx[name_column]
if not name_s.index.is_unique: if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first() name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name" name_s.name = f"{file_label}_name"
frames.append(name_s)
if union_index is None: frames.append(s)
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
raw_data_cols.append(file_label) raw_data_cols.append(file_label)
metric_series_aligned.append(s_aligned) compare_frames.append(s)
if debug and name_s is not None: if len(compare_frames) >= 2:
frames.append(name_s.reindex(union_index)) base = compare_frames[0]
current = compare_frames[-1]
if len(metric_series_aligned) >= 2: if "P99" in data_column or "Median" in data_column:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
ratio = base / current ratio = base / current
else: else:
ratio = current / base ratio = current / base
ratio = ratio.mask(base == 0) ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}" ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio) frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True) concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -288,10 +202,24 @@ def split_json_by_tp_pp(
# ----------------------------- # -----------------------------
# Styling helpers # Styling helpers
# ----------------------------- # -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold( def _highlight_threshold(
df: pd.DataFrame, df: pd.DataFrame, threshold: float
threshold: float,
slack_pct: float = 0.0,
) -> pd.io.formats.style.Styler: ) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df) conc_col = _find_concurrency_col(df)
key_cols = [ key_cols = [
@@ -304,24 +232,12 @@ def _highlight_threshold(
] ]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
try: return df.style.map(
slack_pct = float(slack_pct or 0.0) lambda v: "background-color:#e6ffe6;font-weight:bold;"
except Exception: if pd.notna(v) and v <= threshold
slack_pct = 0.0 else "",
slack_limit = threshold * (1.0 + slack_pct / 100.0) subset=conf_cols,
)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler): def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -359,177 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = name.strip().strip("'")
name = " ".join(name.split())
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# ----------------------------- # -----------------------------
# Valid max concurrency summary helpers # Valid max concurrency summary helpers
# ----------------------------- # -----------------------------
@@ -556,11 +301,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok( def _max_concurrency_ok(
df: pd.DataFrame, df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
): ):
if df is None or conc_col not in df.columns or cfg_col not in df.columns: if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA return pd.NA
@@ -573,14 +314,7 @@ def _max_concurrency_ok(
if d.empty: if d.empty:
return pd.NA return pd.NA
# Accept values up to (1 + slack_pct%) above the SLA. ok = d[d[cfg_col] <= threshold]
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
if ok.empty: if ok.empty:
return pd.NA return pd.NA
@@ -646,25 +380,15 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols: if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = [] rows = []
for cfg in cfg_cols: for cfg in cfg_cols:
ttft_max = ( ttft_max = (
_max_concurrency_ok( _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None if ttft_group_df is not None
else pd.NA else pd.NA
) )
tpot_max = ( tpot_max = (
_max_concurrency_ok( _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None if tpot_group_df is not None
else pd.NA else pd.NA
) )
@@ -693,8 +417,8 @@ def build_valid_max_concurrency_summary_html(
rows.append( rows.append(
{ {
"Configuration": cfg, "Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max, f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max, f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both, f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both, "Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both, "TTFT @ Both (ms)": ttft_at_both,
@@ -704,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows) summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
@@ -711,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)" both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {} formatters = {}
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters) styler = summary_df.style.format(formatters)
@@ -733,104 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"') return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# ----------------------------- # -----------------------------
# Plot helper # Plot helper
# ----------------------------- # -----------------------------
@@ -908,35 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0, default=100.0,
help="Reference limit for TPOT plots (ms)", help="Reference limit for TPOT plots (ms)",
) )
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser return parser
@@ -1015,13 +615,9 @@ def render_metric_table_html(
metric_name = metric_label.lower() metric_name = metric_label.lower()
if "ttft" in metric_name: if "ttft" in metric_name:
styler = _highlight_threshold( styler = _highlight_threshold(display_group, args.ttft_max_ms)
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold( styler = _highlight_threshold(display_group, args.tpot_max_ms)
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
else: else:
styler = display_group.style styler = display_group.style
@@ -1061,6 +657,7 @@ def maybe_write_plot(
markers=True, markers=True,
) )
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>") fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f") fig.update_yaxes(tickformat=".2f")
@@ -1133,186 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items() for metric_label, (df, _) in metric_cache.items()
} }
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
if csv_dir: main_fh.write('<meta charset="utf-8">\n')
csv_dir.mkdir(parents=True, exist_ok=True) for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx" main_fh.write(group_header)
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1" with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable. for metric_label in plan.data_cols:
excel_engine = ( gb = metric_groupbys[metric_label]
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter" df_sorted, raw_data_cols = metric_cache[metric_label]
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {} try:
if excel_engine == "xlsxwriter": group_df = gb.get_group(gkey)
# Reduce memory pressure & usually faster writes. except KeyError:
excel_engine_kwargs = {"options": {"constant_memory": True}} missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
xw_ctx = ( main_fh.write(missing)
nullcontext(None) sub_fh.write(missing)
if disable_excel continue
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs if conc_col not in group_df.columns:
) conc_col = _find_concurrency_col(group_df)
)
with xw_ctx as xw: mn = metric_label.lower().strip()
used_sheets: set[str] = set() if "tok/s" in mn:
# ---- Environment sheet (first) ---- tput_group_df = group_df
env_sheet = _sanitize_sheet_name("Environment") elif "ttft" in mn:
env_df = _load_env_df_for_inputs(args, files) ttft_group_df = group_df
if xw is not None: elif mn in ("p99", "median") or "tpot" in mn:
if env_df is None or env_df.empty: tpot_group_df = group_df
pd.DataFrame(
[ display_group = group_df.drop(
{ columns=group_cols_canonical, errors="ignore"
"Section": "Environment", )
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)", html = render_metric_table_html(
} display_group, metric_label, suffix, args
] )
).to_excel(xw, sheet_name=env_sheet, index=False) main_fh.write(html)
else: sub_fh.write(html)
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet) maybe_write_plot(
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: main_fh,
main_fh.write('<meta charset="utf-8">\n') sub_fh,
for gkey in group_keys: group_df=group_df,
gkey_tuple = normalize_group_key(gkey) raw_data_cols=raw_data_cols,
suffix = build_group_suffix(group_cols_canonical, gkey_tuple) metric_label=metric_label,
sub_path = group_filename(gkey_tuple) y_axis_col=y_axis_col,
group_header = ( args=args,
'<div style="font-size: 1.4em; font-weight: 700; ' )
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}" summary_html = build_valid_max_concurrency_summary_html(
"</div>\n" tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
) )
if summary_html:
main_fh.write(group_header) main_fh.write(summary_html)
sub_fh.write(summary_html)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
def main(): def main():

View File

@@ -1,4 +1,6 @@
#!/bin/bash #!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory # This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/ # Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,26 +9,14 @@
set -x set -x
set -o pipefail set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true) declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true) declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true) declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
@@ -54,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then if [[ $numa_count -gt 0 ]]; then
echo "NUMA found." echo "NUMA found."
echo "$numa_count" echo $numa_count
else else
echo "Need at least 1 NUMA to run benchmarking." echo "Need at least 1 NUMA to run benchmarking."
exit 1 exit 1
@@ -122,12 +112,13 @@ json2envs() {
} }
wait_for_server() { wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200" local timeout_val="1200"
timeout "$timeout_val" bash -c ' timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done done' && return 0 || return 1
'
} }
kill_processes_launched_by_current_bash() { kill_processes_launched_by_current_bash() {
@@ -190,304 +181,6 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() { run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command # run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput) # $1: test type (latency or throughput)
@@ -559,16 +252,37 @@ run_benchmark_tests() {
done done
} }
run_latency_tests() { run_benchmark_tests "latency" "$1"; } run_latency_tests() {
run_startup_tests() { run_benchmark_tests "startup" "$1"; } run_benchmark_tests "latency" "$1"
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } }
merge_serving_tests_stream() { run_startup_tests() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. run_benchmark_tests "startup" "$1"
# This helper does NOT modify JSON; it only filters the stream in dry-run mode. }
local serving_test_file="$1"
# shellcheck disable=SC2016 run_throughput_tests() {
local merged=' run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then if type == "array" then
# Plain format: test cases array # Plain format: test cases array
.[] .[]
@@ -590,50 +304,7 @@ merge_serving_tests_stream() {
else else
error("Unsupported serving test file format: must be array or object with .tests") error("Unsupported serving test file format: must be array or object with .tests")
end end
' ' "$serving_test_file" | while read -r params; do
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -652,48 +323,10 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly. server_args=$(json2args "$server_params")
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
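# Illustration (hedged; json2args/json2envs are defined earlier in this script and
# are assumed to emit one flag per JSON key): given
#   server_parameters = {"model": "Qwen/Qwen3-8B", "tensor_parallel_size": 1, "dtype": "bfloat16"}
# the model key is stripped above and passed positionally to `vllm serve`, leaving
# server_args as roughly "--tensor-parallel-size 1 --dtype bfloat16".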
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
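# Illustration: with PROMPTS_PER_CONCURRENCY set, a JSON-supplied value such as
# "--num-prompts 200" or "--num-prompts=200" is stripped from the client args here
# and re-added per concurrency level further below, so the two never conflict.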
# qps_list # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +358,14 @@ run_serving_tests() {
fi fi
# check if server model and client model is aligned # check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name." echo "Server model and client model must be the same. Skip testcase $test_name."
continue continue
fi fi
server_command="$server_envs vllm serve $server_model \ server_command="$server_envs vllm serve \
$server_args" $server_args"
# run the server # run the server
@@ -739,7 +373,7 @@ run_serving_tests() {
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server # support remote vllm server
client_remote_args="" client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
@@ -750,9 +384,6 @@ run_serving_tests() {
echo "" echo ""
echo "vLLM failed to start within the timeout period." echo "vLLM failed to start within the timeout period."
fi fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then if [[ ${REMOTE_PORT} ]]; then
@@ -771,21 +402,15 @@ run_serving_tests() {
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf" qps="inf"
echo "now qps is $qps"
fi fi
# iterate over different max_concurrency # iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name" echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
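# Worked example: PROMPTS_PER_CONCURRENCY=10 and max_concurrency=64 yields
# "--num-prompts 640"; with the defaults MIN_NUM_PROMPTS=1 and
# MAX_NUM_PROMPTS=1000000 no clamping applies, so larger concurrencies simply
# receive proportionally more prompts.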
# pass the tensor parallel size, the compilation mode, and the optimization # pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard # level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \ client_command="vllm bench serve \
@@ -794,16 +419,13 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--max-concurrency $max_concurrency \ --max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \ --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args_effective $client_remote_args " $client_args $client_remote_args "
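# Illustration (hedged): for the serving_llama8B_tp1_sharegpt case defined in the
# serving-tests JSON later in this diff, one iteration resolves to roughly
#   vllm bench serve --result-filename serving_llama8B_tp1_sharegpt_qps_inf_concurrency_32.json \
#     --request-rate inf --max-concurrency 32 --model meta-llama/Llama-3.1-8B-Instruct \
#     --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200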
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then bash -c "$client_command"
bash -c "$client_command"
fi
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
@@ -818,23 +440,15 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done done
# clean up # clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then kill -9 $server_pid
kill -9 "$server_pid" kill_gpu_processes
kill_gpu_processes
fi
done done
} }
main() { main() {
local ARCH local ARCH
ARCH='' ARCH=''
if [[ "$ON_CPU" == "1" ]]; then if [[ "$ON_CPU" == "1" ]]; then
@@ -844,13 +458,7 @@ main() {
check_gpus check_gpus
ARCH="$arch_suffix" ARCH="$arch_suffix"
fi fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -871,16 +479,11 @@ main() {
# dump vllm info via vllm collect-env # dump vllm info via vllm collect-env
env_output=$(vllm collect-env) env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
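# e.g. DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct bash <this script>
# only walks the serving test definitions (SERVING_JSON, defaulting to
# serving-tests$ARCH.json), records the would-be commands, and exits here before
# the latency/startup/throughput suites; no vLLM server is launched.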
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
@@ -888,7 +491,6 @@ main() {
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite upload_to_buildkite
} }

View File

@@ -51,56 +51,5 @@
"max-model-len": 256, "max-model-len": 256,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
} }
] ]

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,355 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -72,6 +72,17 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{ {
"test_name": "serving_llama8B_tp1_random_128_2048", "test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": { "server_parameters": {
@@ -94,6 +105,17 @@
"random-output-len": 2048 "random-output-len": 2048
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{ {
"test_name": "serving_llama8B_tp1_random_2048_128", "test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": { "server_parameters": {
@@ -117,25 +139,144 @@
} }
}, },
{ {
"test_name": "serving_llama8B_tp1_random_2048_2048", "test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 1 "tensor_parallel_size": 4
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 2048 "random-output-len": 128
} }
}, },
{ {
"test_name": "serving_llama8B_tp2_random_2048_2048", "test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": { "server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2 "tensor_parallel_size": 2
}, },
"client_parameters": { "client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 128,
"random-output-len": 2048 "random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
} }
} }
] ]

View File

@@ -10,6 +10,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -75,83 +78,5 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200 "num_prompts": 200
} }
},
{
"test_name": "serving_deepseek_r1",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 200,
"async-scheduling": "",
"dtype": "bfloat16"
},
"client_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"max-model-len": 2048,
"max-num-seqs": 128,
"async-scheduling": "",
"enable_expert_parallel": "",
"max-num-batched-tokens": 4096
},
"client_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_qwen3_8b",
"qps_list": [1, 4, 10, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"disable_log_stats": "",
"async-scheduling": ""
},
"client_parameters": {
"model": "Qwen/Qwen-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
} }
] ]

View File

@@ -5,6 +5,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -22,6 +23,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -39,6 +41,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -56,6 +59,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": { "speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct", "model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4, "num_speculative_tokens": 4,

View File

@@ -57,67 +57,5 @@
"max-num-seqs": 512, "max-num-seqs": 512,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "throughput_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 384,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": "",
"enable_expert_parallel": ""
}
},
{
"test_name": "throughput_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"max-num-seqs": 512,
"backend": "vllm",
"async-scheduling": ""
}
} }
] ]

View File

@@ -83,7 +83,7 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:

View File

@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710") # Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')" ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}" ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases ## ROCm Wheel and Docker Image Releases
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\` \`\`\`
@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
- **torchvision**: TorchVision for ROCm PyTorch - **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch - **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings - **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm - **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm - **flash-attn**: Flash Attention for ROCm
### :warning: Notes ### :warning: Notes

View File

@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1 exit 1
fi fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2 echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1 exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo "" echo ""
echo "Downloaded wheels:" echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \; ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo "" echo ""
echo "Total: $WHEEL_COUNT wheels" echo "Total: $WHEEL_COUNT wheels"
echo "========================================" echo "========================================"

View File

@@ -1,213 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Check if Ray LLM can generate lock files that are compatible with this
# version of vllm. Downloads Ray's requirement files and runs a full
# dependency resolution with the installed vllm's constraints to see if
# a valid lock file can be produced.
#
# See: https://github.com/vllm-project/vllm/issues/33599
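#
# Assumed usage (illustrative; the exact CI entry point is not shown here): run
# with the vllm wheel under test already installed in the current environment,
#   pip install dist/vllm-*.whl && bash <this script>
# Exit status is 0 when uv can resolve Ray's requirements under vllm's
# constraints, non-zero otherwise.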
set -eo pipefail
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
"requirements.txt"
"requirements/cloud-requirements.txt"
"requirements/base-test-requirements.txt"
"requirements/llm/llm-requirements.txt"
"requirements/llm/llm-test-requirements.txt"
)
for FILE in "${RAY_FILES[@]}"; do
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
echo " ${FILE}"
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
done
# Extract installed vllm deps
echo ">>> Extracting installed vllm dependency constraints"
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
"""Write out the installed vllm's dependencies as pip constraint lines.
Ray uses vllm[audio], so audio-extra deps are included with their extra
markers stripped. The resolver cannot evaluate extra markers for a
package that is not itself being resolved from an index, so we activate
them manually here.
"""
import importlib.metadata
import re
import sys
out_path = sys.argv[1]
raw_reqs = importlib.metadata.requires("vllm") or []
# Ray uses vllm[audio]; activate that extra.
ACTIVE_EXTRAS = {"audio"}
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
lines = []
for r in raw_reqs:
if ";" not in r:
# Unconditional dep — always include.
lines.append(r.strip())
continue
req_part, _, marker_part = r.partition(";")
marker_part = marker_part.strip()
extra_matches = EXTRA_RE.findall(marker_part)
if not extra_matches:
# Non-extra marker (python_version, etc.) — keep as-is.
lines.append(r.strip())
continue
if not ACTIVE_EXTRAS.intersection(extra_matches):
continue # Skip inactive extras (tensorizer, bench, …).
# Strip the extra== conditions but keep any remaining markers
# (e.g. python_version).
cleaned = EXTRA_RE.sub("", marker_part)
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
if cleaned:
lines.append(f"{req_part.strip()} ; {cleaned}")
else:
lines.append(req_part.strip())
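# Worked examples (illustrative requirement strings, not the real vllm deps):
#   "librosa ; extra == 'audio'"                       -> "librosa"
#   "soundfile ; python_version >= '3.10' and extra == 'audio'"
#                                    -> "soundfile ; python_version >= '3.10'"
#   "tensorizer ; extra == 'tensorizer'"               -> skipped (inactive extra)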
with open(out_path, "w") as f:
for line in lines:
f.write(line + "\n")
print(f"Wrote {len(lines)} constraints to {out_path}")
PYEOF
echo ">>> Installed vllm deps (first 20 lines):"
head -20 "${WORK_DIR}/vllm-constraints.txt"
# Remove Ray's vllm pin — the installed vllm's transitive deps
# (written above) replace it in the resolution. vllm itself cannot
# be resolved from PyPI for in-development versions, so we test
# whether Ray's requirements can coexist with vllm's dependency
# constraints instead.
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
# Install uv if needed
if ! command -v uv &>/dev/null; then
echo ">>> Installing uv"
pip install uv -q
fi
# Resolve: given vllm's constraints, can Ray compile a lock file?
#
# vllm's dependency constraints are the fixed side — Ray is flexible and
# can regenerate its lock files. We pass vllm's constraints via -c so
# the resolver treats them as non-negotiable bounds, then check whether
# Ray's own requirements can still be satisfied within those bounds.
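# Illustrative conflict (version bounds made up for the example): if vllm's
# constraints pin protobuf>=6 while one of Ray's requirement files caps
# protobuf<5, no single version satisfies both and the compile below fails.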
echo ""
echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"
set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \
"${WORK_DIR}/cloud-requirements.txt" \
"${WORK_DIR}/base-test-requirements.txt" \
"${WORK_DIR}/llm-requirements.txt" \
"${WORK_DIR}/llm-test-requirements.txt" \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
--extra-index-url https://download.pytorch.org/whl/cu129 \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \
--no-header \
-o "${WORK_DIR}/resolved.txt" \
2>&1
EXIT_CODE=$?
set -e
echo ""
echo "=========================================="
if [ $EXIT_CODE -eq 0 ]; then
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
echo ""
echo "Key resolved versions:"
grep -E '^(protobuf|torch|numpy|transformers)==' \
"${WORK_DIR}/resolved.txt" | sort || true
echo "=========================================="
exit 0
fi
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
echo "This means a fundamental dependency conflict exists that Ray"
echo "cannot resolve by regenerating its lock files."
echo "See: https://github.com/vllm-project/vllm/issues/33599"
echo "=========================================="
# Buildkite annotation
if [ -f /usr/bin/buildkite-agent ]; then
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
### :warning: Ray Dependency Compatibility Warning
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
Ray would not be able to regenerate its lock files to accommodate this vllm version.
Please check the **Ray Dependency Compatibility Check** step logs for details.
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
url = os.getenv("BUILDKITE_BUILD_URL", "#")
data = {
"text": ":warning: Ray Dependency Compatibility Check Failed",
"blocks": [{
"type": "section",
"text": {
"type": "mrkdwn",
"text": (
"*:warning: Ray Dependency Compatibility Check Failed*\n"
f"PR #{pr} on branch `{branch}` introduces dependencies "
f"that cannot be resolved with Ray'\''s requirements.\n"
f"<{url}|View Build>"
),
},
}],
}
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi
exit 1

View File

@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file # Store PR data in a temp file
PR_DATA=$(mktemp) PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \ if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \ --limit 1000 \

View File

@@ -1,57 +1,25 @@
#!/bin/bash #!/bin/bash
# This script runs tests inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
# bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
set -o pipefail set -o pipefail
# Export Python path # Export Python path
export PYTHONPATH=".." export PYTHONPATH=".."
############################################################################### # Print ROCm version
# Helper Functions echo "--- Confirming Clean Initial State"
############################################################################### while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() { echo "--- ROCm info"
local timeout=${1:-300} rocminfo
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
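# e.g. wait_for_clean_gpus        # default 300 s timeout
#      wait_for_clean_gpus 600    # allow extra time right after a GPU reset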
# cleanup older docker images
cleanup_docker() { cleanup_docker() {
# Get Docker's root directory # Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}') docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
exit 1 exit 1
fi fi
echo "Docker root directory: $docker_root" echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70 threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
@@ -74,445 +45,193 @@ cleanup_docker() {
} }
cleanup_network() { cleanup_network() {
local max_nodes=${NUM_NODES:-2} for node in $(seq 0 $((NUM_NODES-1))); do
for node in $(seq 0 $((max_nodes - 1))); do if docker ps -a -q -f name="node${node}" | grep -q .; then
if docker ps -a -q -f name="node${node}" | grep -q .; then docker stop "node${node}"
docker stop "node${node}" || true
fi fi
done done
if docker network ls | grep -q docker-net; then if docker network ls | grep docker-net; then
docker network rm docker-net || true docker network rm docker-net
fi fi
} }
is_multi_node() { # Call the cleanup docker function
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
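# Examples: NUM_NODES=2 in the environment, or a command string shaped like
#   "[pytest -v -s distributed/test_a.py] && [pytest -v -s distributed/test_b.py]",
# is detected as multi-node; a plain "pytest -v -s v1/core" is single-node.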
handle_pytest_exit() {
local exit_code=$1
if [ "$exit_code" -eq 5 ]; then
echo "Pytest exit code 5 (no tests collected) - treating as success."
exit 0
fi
exit "$exit_code"
}
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
# - test path (contains '/')
# - test file (ends with '.py')
# - another pytest flag (--xxx or -x single-char flags)
# - command separator (&& || ; |)
# - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
re_quote_pytest_markers() {
local input="$1"
local output=""
local collecting=false
local marker_buf=""
# Strip backslash-newline continuations, then flatten remaining newlines
local flat="${input//$'\\\n'/ }"
flat="${flat//$'\n'/ }"
# Disable globbing to prevent *.py etc. from expanding during read -ra
local restore_glob
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
set -o noglob
local -a words
read -ra words <<< "$flat"
eval "$restore_glob"
for word in "${words[@]}"; do
if $collecting; then
# If the token we're about to collect already contains a literal
# single quote, the expression was already quoted upstream.
# Flush and stop collecting.
if [[ "$word" == *"'"* ]]; then
if [[ -n "$marker_buf" ]]; then
# Should not normally happen (partial buf + quote), flush raw
output+="${marker_buf} "
marker_buf=""
fi
output+="${word} "
collecting=false
continue
fi
local is_boundary=false
case "$word" in
# Line-continuation artifact
"\\")
is_boundary=true ;;
# Command separators
"&&"|"||"|";"|"|")
is_boundary=true ;;
# Long flags (--ignore, --shard-id, etc.)
--*)
is_boundary=true ;;
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
# like "not" which don't start with "-". Also skip -k/-m which
# would start a new marker (handled below).
-[a-zA-Z])
is_boundary=true ;;
# Test path (contains /)
*/*)
is_boundary=true ;;
# Test file (ends with .py, possibly with ::method)
*.py|*.py::*)
is_boundary=true ;;
# Environment variable assignment preceding a command (FOO=bar)
*=*)
# Only treat as boundary if it looks like VAR=value, not
# pytest filter expressions like num_gpus=2 inside markers
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
is_boundary=true
fi
;;
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
else
output+="${marker_buf} "
fi
collecting=false
marker_buf=""
# Check if this boundary word itself starts a new -m/-k
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
# Drop stray backslash tokens silently
elif [[ "$word" == "\\" ]]; then
:
else
output+="${word} "
fi
else
# Accumulate into marker buffer
if [[ -n "$marker_buf" ]]; then
marker_buf+=" ${word}"
else
marker_buf="${word}"
fi
fi
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
marker_buf=""
else
output+="${word} "
fi
done
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
output+="${marker_buf}"
fi
fi
echo "${output% }"
}
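# Illustrative transformation (marker names here are only examples):
#   in : pytest -v -s -m not cpu_test and not slow_test v1/core
#   out: pytest -v -s -m 'not cpu_test and not slow_test' v1/core
# Single-word markers (-m cpu_test) and already-quoted expressions pass through
# unchanged.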
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
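# Illustrative rewrites (taken from the cases above):
#   "pytest -v -s lora"  -> "VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"
#   "pytest -v -s models/test_registry.py"
#        -> same command plus "-k 'not BambaForCausalLM and not GritLM ...'"
# Commands that target kernels/core additionally gain the two --ignore flags
# listed above.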
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker cleanup_docker
echo "--- Resetting GPUs" echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image --- echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container" echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}" docker pull "${image_name}"
remove_docker_container() { remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container" echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
# ---- Command source selection ---- commands=$@
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell.
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
commands="${VLLM_TEST_COMMANDS}"
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
else
commands="$*"
if [[ -z "$commands" ]]; then
echo "Error: No test commands provided." >&2
echo "Usage:" >&2
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
echo " Legacy: bash $0 \"commands here\"" >&2
exit 1
fi
echo "Commands sourced from positional args (legacy mode)"
echo "WARNING: Inner double-quotes in the command string may have been"
echo " stripped by the calling shell. If you see syntax errors, switch to:"
echo " export VLLM_TEST_COMMANDS='your commands here'"
echo " bash $0"
fi
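# Illustrative invocations (the script path and the pytest target are placeholders only):
#   Preferred: VLLM_TEST_COMMANDS='cd tests; pytest -v -s lora' bash run-amd-test.sh
#   Legacy:    bash run-amd-test.sh "cd tests; pytest -v -s lora"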
echo "Raw commands: $commands" echo "Raw commands: $commands"
# Fix quoting before ROCm overrides (so overrides see correct structure) commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"
commands=$(apply_rocm_test_overrides "$commands") if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands" echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
MYPYTHONPATH=".." MYPYTHONPATH=".."
# Verify GPU access # Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3) render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2 echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1 exit 1
fi fi
# --- RDMA device passthrough (conditional) --- if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
RDMA_FLAGS=""
if [ -d /dev/infiniband ]; then
echo "RDMA devices detected on host, enabling passthrough"
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
else
echo "No RDMA devices found on host, RDMA tests will be skipped"
fi
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/') export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds] if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
# BASH_REMATCH[1] = prefix (everything before first bracket) prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
# BASH_REMATCH[2] = comma-separated node0 commands echo "PREFIX: ${prefix}"
# BASH_REMATCH[3] = comma-separated node1 commands export composite_command="(command rocm-smi || true)"
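# Illustrative commands string in this format (contents are made up):
#   export FOO=1 ; ["pytest -v -s dist/test_a.py", "pytest -v -s dist/test_b.py"] && ["pytest -v -s dist/test_a.py", "pytest -v -s dist/test_b.py"]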
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then myIFS=$IFS
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g') IFS=','
echo "PREFIX: ${prefix}" read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export composite_command="(command rocm-smi || true)" export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
saved_IFS=$IFS echo "COMMANDS: ${commands}"
IFS=',' composite_command=$(echo "${composite_command} && ${commands}")
read -ra node0 <<< "${BASH_REMATCH[2]}" done
read -ra node1 <<< "${BASH_REMATCH[3]}" /bin/bash -c "${composite_command}"
IFS=$saved_IFS cleanup_network
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
exit_code=$?
cleanup_network
handle_pytest_exit "$exit_code"
else else
echo "Multi-node job detected but failed to parse bracket command syntax." echo "Failed to parse node commands! Exiting."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]" cleanup_network
echo "Got: $commands" exit 111
cleanup_network
exit 111
fi fi
else else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
$RDMA_FLAGS \ --network=host \
--network=host \ --shm-size=16gb \
--shm-size=16gb \ --group-add "$render_gid" \
--group-add "$render_gid" \ --rm \
--rm \ -e HF_TOKEN \
-e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \
-e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \
-e AWS_SECRET_ACCESS_KEY \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB \ -e "HF_HOME=${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB_COUNT \ -e "PYTHONPATH=${MYPYTHONPATH}" \
-v "${HF_CACHE}:${HF_MOUNT}" \ --name "${container_name}" \
-e "HF_HOME=${HF_MOUNT}" \ "${image_name}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \ /bin/bash -c "${commands}"
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
exit_code=$?
handle_pytest_exit "$exit_code"
fi fi

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-process spawning to speed up the run
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log

View File

@@ -1,43 +1,26 @@
#!/bin/bash #!/bin/bash
set -euox pipefail set -euox pipefail
export VLLM_CPU_CI_ENV=0
echo "--- PP+TP" echo "--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \ --num-prompts 20 \
--result-dir ./test_results \
--result-filename tp_pp.json \
--save-result \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true kill -s SIGTERM $server_pid &
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi
echo "--- DP+TP" echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \ --num-prompts 20 \
--result-dir ./test_results \
--result-filename dp_pp.json \
--save-result \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true kill -s SIGTERM $server_pid &
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference # offline inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests # Run model tests
docker exec cpu-test bash -c " docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1 export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.

View File

@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
' '

View File

@@ -1,42 +1,17 @@
#!/bin/bash #!/bin/bash
# This script builds the HPU docker image and runs the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
#
# vllm-gaudi compatibility pinning:
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
# When upstream vllm changes its API, the plugin may break before it has been updated.
# To handle this, the vllm-gaudi repository maintains a file:
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
# The first line of that file controls what version of vllm is used inside the Docker image:
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
# the test to a known-compatible baseline.
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
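# Illustrative pin file contents (only the first line is read), either:
#   latest
# or a single known-good vllm commit SHA (made-up value):
#   0123456789abcdef0123456789abcdef01234567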
set -exuo pipefail set -exuo pipefail
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
VLLM_COMMUNITY_COMMIT=$(curl -s \
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
| head -1 | tr -d '\n')
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
# Try building the docker image # Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}" image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container" container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - . cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest FROM gaudi-base-image:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
# the current checkout (the Buildkite CI commit) is used unchanged.
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
fi
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1 ENV no_proxy=localhost,127.0.0.1
@@ -64,19 +39,19 @@ EOF
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; } remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers remove_docker_containers
echo "Running HPU plugin v1 test" echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \ docker run --rm --runtime=habana --name=${container_name} --network=host \
-e HABANA_VISIBLE_DEVICES=all \ -e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \ -e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-e PT_HPU_LAZY_MODE=1 \ -e PT_HPU_LAZY_MODE=1 \
"${image_name}" \ "${image_name}" \
/bin/bash -c ' /bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
' '
EXITCODE=$? EXITCODE=$?

View File

@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1 exit 1
fi fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}" source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0 return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration. # get test running configuration.
fetch_vllm_test_cfg fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script. # Check if the function call was successful. If not, exit the script.
if ! get_config; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}" echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}" builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}" builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}" mkdir -p ${builder_cache_dir}
# Try building the docker image # Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \ cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \ --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \ --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \ --cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t "${image_name}" -f - . --progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME} FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. # e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ... # returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() { parse_and_gen_devices() {
local input="$1" local input="$1"
local index cards_num local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1 return 1
fi fi
local devices=""
local i=0 local i=0
while (( i < cards_num )); do while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i )) local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device" devices="$devices --device /dev/davinci${dev_idx}"
printf '%s\n' "/dev/davinci${dev_idx}"
((i++)) ((i++))
done done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
} }
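# Worked example for the agent-name format described above (values taken from the comment's example):
#   BUILDKITE_AGENT_NAME=atlas-a2-001-1-2cards -> index=1, cards_num=2
#   resulting docker args: --device /dev/davinci0 --device /dev/davinci1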
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with # This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend. # the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx} model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}" mkdir -p ${model_cache_dir}
docker run \ docker run \
"${device_args[@]}" \ ${devices} \
--device /dev/davinci_manager \ --device /dev/davinci_manager \
--device /dev/devmm_svm \ --device /dev/devmm_svm \
--device /dev/hisi_hdc \ --device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \ -v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \ -v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \ --entrypoint="" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu . docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
@@ -34,17 +34,17 @@ docker run \
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/engine pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public # pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images # tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub # push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64 docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64 docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub # push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME" docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \ --enable-eplb \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
cleanup
SERVER_PID=

View File

@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --enable-eplb \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \ --data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \ --enable-expert-parallel \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter") BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail # Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN") PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode." echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else else
# Non-ROCm platform (CUDA/other) # Non-ROCm platform (CUDA/other)
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \ --gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \ "${PLATFORM_ARGS[@]}" \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

View File

@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables # For testing on local vm, use `set -a` to export all variables
source /etc/environment source /etc/environment
# shellcheck source=/dev/null source $ENV_FILE
source "$ENV_FILE"
remove_docker_container() { remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true; docker rm -f $CONTAINER_NAME || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME" echo "starting docker...$CONTAINER_NAME"
echo echo
docker run \ docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file "$ENV_FILE" \ --env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \ -e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL="$MODEL" \ -e MODEL=$MODEL \
-e WORKSPACE=/workspace \ -e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \ --name $CONTAINER_NAME \
-d \ -d \
--privileged \ --privileged \
--network host \ --network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve "$MODEL" \ vllm serve $MODEL \
--seed 42 \ --seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \ --max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \ --download_dir $DOWNLOAD_DIR \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 & --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.." echo "wait for 20 minutes.."
echo echo
# sleep 1200 # sleep 1200
# wait for 10 minutes... # wait for 10 minutes...
for _ in {1..120}; do for i in {1..120}; do
# TODO: detect other type of errors. # TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting." echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo echo
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model "$MODEL" \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \ --dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len "$OUTPUT_LEN" \ --sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG" --ignore-eos > "$BM_LOG"
echo "completed..." echo "completed..."

View File

@@ -72,19 +72,20 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels # call script to generate indicies for all existing wheels
# these indices have relative paths that work as long as they sit next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=() if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS") else
alias_arg=""
fi fi
# HACK: we do not need regex module here, but it is required by pre-commit hook # HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module # To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}" $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally # copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX" echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*" rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}" $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi
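
The alias_args change above replaces an optional flag held in a string with a bash array. A minimal, self-contained sketch of why the empty array is the safer "no extra arguments" default:

    # Sketch only: an empty string still expands to one (empty) argument when quoted,
    # while an empty array expands to nothing at all.
    flag_str=""
    flag_arr=()
    set -- "$flag_str";      echo "string: $# arg(s)"   # string: 1 arg(s)
    set -- "${flag_arr[@]}"; echo "array:  $# arg(s)"   # array:  0 arg(s)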


@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version) RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null) GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION" echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -54,13 +54,10 @@ mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64') # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory" echo "Wheels copied to local directory"
# generate source distribution using setup.py # generate source tarball
python setup.py sdist --dist-dir=$DIST_DIR git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR ls -la $DIST_DIR
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name) # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*") PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -68,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1 exit 1
fi fi
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE" python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE" python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels and source distribution uploaded to PyPI" echo "Wheels uploaded to PyPI"


@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT" echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi fi
# Extract version from vLLM wheel and update version-specific index # Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1) VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION" echo "Version in wheel: $VERSION"

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/ - tests/benchmarks/
commands: commands:
- pytest -v -s benchmarks/ - pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1


@@ -36,16 +36,6 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100) - label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
@@ -101,8 +91,8 @@ steps:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100) - label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30 timeout_in_minutes: 30
@@ -131,10 +121,13 @@ steps:
optional: true optional: true
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition) # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)" # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100) - label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -150,8 +143,8 @@ steps:
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100) - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -169,7 +162,7 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100) - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -204,8 +197,7 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3 # for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

View File

@@ -50,18 +50,23 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py - pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Torchrun + Examples (4 GPUs) - label: Distributed Tests (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_torchrun_example.py - tests/distributed/test_utils
- tests/distributed/test_torchrun_example_moe.py - tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/ - examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,27 +84,6 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -107,27 +91,20 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py - pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node) # TODO: create a dedicated test section for multi-GPU example tests
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node # when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
@@ -169,7 +146,6 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -189,7 +165,6 @@ steps:
num_devices: 2 num_devices: 2
num_nodes: 2 num_nodes: 2
no_plugin: true no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -222,31 +197,7 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4


@@ -29,11 +29,15 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100) - label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 30
device: h100
optional: true optional: true
num_devices: 1 soft_fail: true
num_devices: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 - nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh


@@ -14,59 +14,17 @@ steps:
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: Engine (1 GPU) - label: V1 e2e + engine
timeout_in_minutes: 30 timeout_in_minutes: 45
source_file_dependencies: source_file_dependencies:
- vllm/v1/engine/ - vllm/
- tests/v1/engine/ - tests/v1
commands: commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 2
source_file_dependencies:
- vllm/
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
depends_on:
- image-build-amd
- label: V1 e2e (4 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 4
source_file_dependencies:
- vllm/
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4
depends_on:
- image-build-amd


@@ -34,26 +34,23 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2) - label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130 timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use - tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use - pytest -v -s tool_use
- label: Entrypoints Integration (Pooling) - label: Entrypoints Integration (Pooling)
@@ -82,11 +79,6 @@ steps:
- tests/v1 - tests/v1
commands: commands:
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness - label: OpenAI API Correctness
timeout_in_minutes: 30 timeout_in_minutes: 30


@@ -21,18 +21,3 @@ steps:
commands: commands:
- pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py - pytest -v -s distributed/test_eplb_spec_decode.py
- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: b200
optional: true
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/compilation/
- tests/distributed/
commands:
- pytest -v -s distributed/test_elastic_ep.py


@@ -8,9 +8,8 @@ steps:
- csrc/ - csrc/
- tests/kernels/core - tests/kernels/core
- tests/kernels/test_top_k_per_row.py - tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands: commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py - pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N - label: Kernels Attention Test %N
timeout_in_minutes: 35 timeout_in_minutes: 35
@@ -45,8 +44,7 @@ steps:
- vllm/envs.py - vllm/envs.py
- vllm/config - vllm/config
commands: commands:
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 2
- label: Kernels Mamba Test - label: Kernels Mamba Test
@@ -72,7 +70,7 @@ steps:
- tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py - tests/kernels/attention/test_deepgemm_attention.py
commands: commands:
- pytest -v -s kernels/quantization/test_block_fp8.py - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -97,7 +95,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/basic/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -117,7 +115,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e # e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py - pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -157,6 +154,8 @@ steps:
commands: commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py - pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200) - label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60


@@ -11,17 +11,17 @@ steps:
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
# - label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
# device: a100 device: a100
# optional: true optional: true
# num_devices: 4 num_devices: 4
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
# source_file_dependencies: source_file_dependencies:
# - csrc/ - csrc/
# - vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
# commands: commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100) - label: LM Eval Large Models (4 GPUs)(H100)
device: h100 device: h100
@@ -73,29 +73,3 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt


@@ -9,7 +9,6 @@ steps:
- tests/v1 - tests/v1
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor - pytest -v -s v1/executor
@@ -17,7 +16,6 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
@@ -27,11 +25,6 @@ steps:
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU) - label: V1 Others (CPU)
depends_on: depends_on:
@@ -67,13 +60,12 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic - python3 offline_inference/basic/chat.py # for basic
- python3 basic/offline_inference/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/classify.py
- python3 basic/offline_inference/classify.py - python3 offline_inference/basic/embed.py
- python3 basic/offline_inference/embed.py - python3 offline_inference/basic/score.py
- python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
@@ -116,11 +108,9 @@ steps:
timeout_in_minutes: 50 timeout_in_minutes: 50
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/detokenizer
- tests/multimodal - tests/multimodal
- tests/utils_ - tests/utils_
commands: commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
@@ -133,7 +123,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/test_pooling_params.py - tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal - tests/multimodal
- tests/renderers - tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
@@ -147,7 +136,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py - pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers - pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
@@ -155,6 +143,20 @@ steps:
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
device: h100 device: h100


@@ -9,9 +9,9 @@ steps:
- vllm/config/model.py - vllm/config/model.py
- vllm/model_executor - vllm/model_executor
- tests/model_executor - tests/model_executor
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor - pytest -v -s model_executor
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py


@@ -1,110 +0,0 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"


@@ -4,6 +4,7 @@ depends_on:
steps: steps:
- label: Basic Models Tests (Initialization) - label: Basic Models Tests (Initialization)
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N - label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -36,12 +38,6 @@ steps:
- tests/models/test_registry.py - tests/models/test_registry.py
commands: commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Basic Models Test (Other CPU) # 5min - label: Basic Models Test (Other CPU) # 5min
depends_on: depends_on:
@@ -65,7 +61,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper


@@ -4,6 +4,7 @@ depends_on:
steps: steps:
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N - label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N - label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75 timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -37,7 +40,7 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests # Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min - label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -52,21 +56,13 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL) - label: Language Models Test (PPL)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min - label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/pooling - tests/models/language/pooling
commands: commands:
- pytest -v -s models/language/pooling -m 'not core_model' - pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB) - label: Language Models Test (MTEB)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/


@@ -2,65 +2,16 @@ group: Models - Multimodal
depends_on: depends_on:
- image-build - image-build
steps: steps:
- label: "Multi-Modal Models (Standard) 1: qwen2" - label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 45 timeout_in_minutes: 80
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor Test (CPU)
depends_on: depends_on:
@@ -69,7 +20,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
- tests/models/registry.py
device: cpu device: cpu
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
@@ -80,7 +30,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
- tests/models/registry.py
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -103,11 +52,6 @@ steps:
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2 - label: Multi-Modal Models (Extended) 2
optional: true optional: true
@@ -126,3 +70,12 @@ steps:
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*


@@ -15,17 +15,10 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py - pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y - pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests # end platform plugin tests
# begin io_processor plugins test # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
- pip install -e ./plugins/prithvi_io_processor_plugin - pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py - pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y - pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
- pip uninstall bge_m3_sparse_plugin -y
# end io_processor plugins test # end io_processor plugins test
# begin stat_logger plugins test # begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -36,6 +29,6 @@ steps:
- pytest -v -s plugins_tests/test_scheduler_plugins.py - pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins


@@ -1,16 +0,0 @@
group: Ray Compatibility
depends_on:
- image-build
steps:
- label: Ray Dependency Compatibility Check
# Informational only — does not block the pipeline.
# If this fails, it means the PR introduces a dependency that
# conflicts with Ray's dependency constraints.
# See https://github.com/vllm-project/vllm/issues/33599
soft_fail: true
timeout_in_minutes: 10
source_file_dependencies:
- requirements/
- setup.py
commands:
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh


@@ -12,10 +12,3 @@ steps:
commands: commands:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers


@@ -1,40 +0,0 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"


@@ -13,13 +13,13 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
# - label: Weight Loading Multiple GPU - Large Models # optional - label: Weight Loading Multiple GPU - Large Models # optional
# working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
# num_devices: 2 num_devices: 2
# device: a100 device: a100
# optional: true optional: true
# source_file_dependencies: source_file_dependencies:
# - vllm/ - vllm/
# - tests/weight_loading - tests/weight_loading
# commands: commands:
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

.github/.bc-linter.yml

@@ -0,0 +1,24 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attetion/*.py"
# - "vllm/v1/core/*.py"
exclude:
- "**/*.py"
scan:
functions: true # check free functions and methods
classes: true # check classes/dataclasses
public_only: true # ignore names starting with "_" at any level
annotations:
include: # decorators that forceinclude a symbol
- name: "bc_linter_include" # matched by simple name or dotted suffix
propagate_to_members: false # for classes, include methods/inner classes
exclude: # decorators that forceexclude a symbol
- name: "bc_linter_skip" # matched by simple name or dotted suffix
propagate_to_members: true # for classes, exclude methods/inner classes
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
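The `annotations` block above matches decorators purely by name ("simple name or dotted suffix"), so the repository only needs importable markers called `bc_linter_include` and `bc_linter_skip`; they can be runtime no-ops. A minimal sketch under that assumption (names and placement are illustrative, not taken from the vLLM tree):

```python
# Hypothetical no-op marker decorators for the BC linter. The linter matches
# them by name, so returning the object unchanged is enough.
from typing import TypeVar

T = TypeVar("T")


def bc_linter_include(obj: T) -> T:
    # Force the decorated symbol into BC-lint coverage despite the global
    # `exclude: "**/*.py"` rule in the config above.
    return obj


def bc_linter_skip(obj: T) -> T:
    # Exclude the decorated symbol; with propagate_to_members: true, class
    # methods and inner classes are excluded as well.
    return obj


@bc_linter_include
def stable_public_api(request_id: str, num_tokens: int) -> int:
    # Backward-incompatible signature changes here would now be reported.
    return num_tokens
```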

.github/CODEOWNERS

@@ -2,66 +2,45 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery /vllm/model_executor/layers/attention @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact, # Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people # so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345 /vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
# vLLM V1 # vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni /vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni /vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery /vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/engine @njhill /vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/executor @njhill /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
/vllm/v1/worker @njhill
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
# Model runner V2 # Model runner V2
/vllm/v1/worker/gpu @WoosukKwon @njhill /vllm/v1/worker/gpu @WoosukKwon
/vllm/v1/worker/gpu/kv_connector.py @orozery
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin /.buildkite/lm-eval-harness @mgoin
@@ -136,8 +115,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten /vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels # Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -173,7 +152,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop /examples/pooling @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop /vllm/model_executor/layers/pooler @noooop
# Security guide and policies # Security guide and policies

.github/mergify.yml

@@ -3,7 +3,6 @@ pull_request_rules:
description: Automatically apply documentation label description: Automatically apply documentation label
conditions: conditions:
- label != stale - label != stale
- -closed
- or: - or:
- files~=^[^/]+\.md$ - files~=^[^/]+\.md$
- files~=^docs/ - files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run: Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash ```bash
uv pip install pre-commit>=4.5.1 uv pip install pre-commit
pre-commit install pre-commit install
pre-commit run --all-files pre-commit run --all-files
``` ```
@@ -38,13 +37,15 @@ pull_request_rules:
> [!TIP] > [!TIP]
> <details> > <details>
> <summary>Is <code>mypy</code> failing?</summary> > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/> > <br/>
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally: > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
> >
> ```bash > ```bash
> # For mypy (substitute "3.10" with the failing version if needed) > # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10 > pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ``` > ```
> </details> > </details>
@@ -258,7 +259,8 @@ pull_request_rules:
- files=benchmarks/run_structured_output_benchmark.sh - files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md - files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py - files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/structured_outputs/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/ - files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/ - files~=^vllm/v1/structured_output/
@@ -334,7 +336,7 @@ pull_request_rules:
- or: - or:
- files~=^tests/tool_use/ - files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md - files=docs/features/tool_calling.md
- files~=^examples/tool_chat_* - files~=^examples/tool_chat_*
@@ -381,7 +383,7 @@ pull_request_rules:
- or: - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions: actions:
assign: assign:

.github/workflows/bc-lint.yml

@@ -0,0 +1,29 @@
name: BC Lint
on:
pull_request:
types:
- opened
- synchronize
- reopened
- labeled
- unlabeled
jobs:
bc_lint:
if: github.repository_owner == 'vllm-project'
runs-on: ubuntu-latest
steps:
- name: Run BC Lint Action
uses: pytorch/test-infra/.github/actions/bc-lint@main
with:
repo: ${{ github.event.pull_request.head.repo.full_name }}
base_sha: ${{ github.event.pull_request.base.sha }}
head_sha: ${{ github.event.pull_request.head.sha }}
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir: .github
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true


@@ -19,7 +19,6 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: '3.12' python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies - name: Install Python dependencies
run: | run: |


@@ -6,9 +6,6 @@ on:
- main - main
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
permissions:
contents: read
jobs: jobs:
macos-m1-smoke-test: macos-m1-smoke-test:
runs-on: macos-latest runs-on: macos-latest

.gitignore

@@ -3,8 +3,6 @@
# vllm-flash-attn built from source # vllm-flash-attn built from source
vllm/vllm_flash_attn/* vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/__init__.py
!vllm/vllm_flash_attn/flash_attn_interface.py
# OpenAI triton kernels copied from source # OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/* vllm/third_party/triton_kernels/*
@@ -189,9 +187,11 @@ cython_debug/
.vscode/ .vscode/
# Claude # Claude
CLAUDE.md
.claude/ .claude/
# Codex # Codex
AGENTS.md
.codex/ .codex/
# Cursor # Cursor
@@ -238,6 +238,3 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h


@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.43.5 rev: v1.38.1
hooks: hooks:
- id: typos - id: typos
args: [--force-exclude] args: [--force-exclude]
@@ -24,13 +24,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda] types_or: [c++, cuda]
args: [--style=file, --verbose] args: [--style=file, --verbose]
- repo: https://github.com/DavidAnson/markdownlint-cli2 - repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.21.0 rev: v0.45.0
hooks: hooks:
- id: markdownlint-cli2 - id: markdownlint
language_version: lts exclude: '.*\.inc\.md'
args: [--fix] stages: [manual] # Only run in CI
exclude: ^CLAUDE\.md$
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
hooks: hooks:
@@ -56,7 +55,7 @@ repos:
language: python language: python
types_or: [python, pyi] types_or: [python, pyi]
require_serial: true require_serial: true
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10 name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10" entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -128,13 +127,6 @@ repos:
language: python language: python
types: [python] types: [python]
additional_dependencies: [regex] additional_dependencies: [regex]
# prevent use torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' APIs call"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config - id: validate-config
name: Validate configuration has default values and that each field has a docstring name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py entry: python tools/pre_commit/validate_config.py
@@ -151,11 +143,6 @@ repos:
name: Check attention backend documentation is up to date name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion


@@ -9,15 +9,13 @@ build:
python: "3.12" python: "3.12"
jobs: jobs:
post_checkout: post_checkout:
# - bash docs/maybe_skip_pr_build.sh - git fetch --unshallow || true
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
mkdocs: mkdocs:
configuration: mkdocs.yaml configuration: mkdocs.yaml
fail_on_warning: true fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt

AGENTS.md

@@ -1,113 +0,0 @@
# Agent Instructions for vLLM
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
> Breaching these guidelines can result in automatic banning.
## 1. Contribution Policy (Mandatory)
### Duplicate-work checks
Before proposing a PR, run these checks:
```bash
gh issue view <issue_number> --repo vllm-project/vllm --comments
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
```
- If an open PR already addresses the same fix, do not open another.
- If your approach is materially different, explain the difference in the issue.
### No low-value busywork PRs
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
### Accountability
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
- The submitting human must review every changed line and run relevant tests.
- PR descriptions for AI-assisted work **must** include:
- Why this is not duplicating an existing PR.
- Test commands run and results.
- Clear statement that AI assistance was used.
### Fail-closed behavior
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
---
## 2. Development Workflow
### Environment setup
```bash
# Install `uv` if you don't have it already:
curl -LsSf https://astral.sh/uv/install.sh | sh
# Always use `uv` for Python environment management:
uv venv --python 3.12
source .venv/bin/activate
# Always make sure `pre-commit` and its hooks are installed:
uv pip install -r requirements/lint.txt
pre-commit install
```
### Installing dependencies
```bash
# If you are only making Python changes:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
# If you are also making C/C++ changes:
uv pip install -e .
```
### Running tests
Tests require extra dependencies.
All versions for test dependencies should be read from `requirements/test.txt`
```bash
# Install bare minimum test dependencies:
uv pip install pytest pytest-asyncio tblib
# Install additional test dependencies as needed, or install them all as follows:
uv pip install -r requirements/test.txt
# Run specific test from specific test file
pytest tests/path/to/test.py -v -s -k test_name
# Run all tests in directory
pytest tests/path/to/dir -v -s
```
### Running linters
```bash
# Run all pre-commit hooks on staged files:
pre-commit run
# Run on all files:
pre-commit run --all-files
# Run a specific hook:
pre-commit run ruff-check --all-files
# Run mypy as it is in CI:
pre-commit run mypy-3.10 --all-files --hook-stage manual
```
### Commit messages
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```


@@ -1 +0,0 @@
@AGENTS.md


@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# ROCm installation prefix. Default to /opt/rocm but allow override via # ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake. # -DROCM_PATH=/your/rocm/path when invoking cmake.
@@ -293,7 +293,6 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu" "csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu" "csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -725,7 +724,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels # CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output. # if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
@@ -771,51 +770,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
set(SRCS
"csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
"csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
AND ES_MXFP8_GROUPED_MM_ARCHS)
message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
"not >= 12.8.")
else()
message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels. # moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -998,8 +952,7 @@ set(VLLM_MOE_EXT_SRC
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu" "csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu" "csrc/moe/grouped_topk_kernels.cu")
"csrc/moe/router_gemm.cu")
endif() endif()
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1128,27 +1081,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures") " in CUDA target architectures")
endif() endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")


@@ -187,7 +187,7 @@ python benchmark.py \
## Hardware Requirements ## Hardware Requirements
| Backend | Hardware | | Backend | Hardware |
| ------- | -------- | |---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU | | Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) | | CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) | | FlashAttn MLA | Hopper (SM90+) |


@@ -15,6 +15,7 @@ from .common import (
BenchmarkConfig, BenchmarkConfig,
BenchmarkResult, BenchmarkResult,
MockLayer, MockLayer,
MockModelConfig,
ResultsFormatter, ResultsFormatter,
get_attention_scale, get_attention_scale,
is_mla_backend, is_mla_backend,
@@ -35,6 +36,7 @@ __all__ = [
"ResultsFormatter", "ResultsFormatter",
# Mock objects # Mock objects
"MockLayer", "MockLayer",
"MockModelConfig",
# Utilities # Utilities
"setup_mla_dims", "setup_mla_dims",
"get_attention_scale", "get_attention_scale",


@@ -229,40 +229,3 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
sum(r.kv_len for r in requests) / len(requests) if requests else 0 sum(r.kv_len for r in requests) / len(requests) if requests else 0
), ),
} }
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"


@@ -43,12 +43,9 @@ from common import (
ModelParameterSweep, ModelParameterSweep,
ParameterSweep, ParameterSweep,
ResultsFormatter, ResultsFormatter,
batch_spec_sort_key,
is_mla_backend, is_mla_backend,
) )
from vllm.v1.worker.workspace import init_workspace_manager
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer).""" """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -61,9 +58,7 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend.""" """Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla from mla_runner import run_mla_benchmark as run_mla
return run_mla( return run_mla(config.backend, config, **kwargs)
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -223,13 +218,10 @@ def run_model_parameter_sweep(
by_param_and_spec[key].append(r) by_param_and_spec[key].append(r)
break break
# Sort by param value then spec (batch_size, q_len, kv_len) # Sort by param value then spec
sorted_keys = sorted( sorted_keys = sorted(
by_param_and_spec.keys(), by_param_and_spec.keys(),
key=lambda x: ( key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
) )
current_param_value = None current_param_value = None
@@ -338,7 +330,7 @@ def run_parameter_sweep(
by_spec[spec] = [] by_spec[spec] = []
by_spec[spec].append(r) by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key): for spec in sorted(by_spec.keys()):
results = by_spec[spec] results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time) best = min(results, key=lambda r: r.mean_time)
console.print( console.print(
@@ -444,27 +436,20 @@ def main():
# Backend selection # Backend selection
parser.add_argument( parser.add_argument(
"--backends", "--backends",
"--decode-backends",
nargs="+", nargs="+",
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, " help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)", "flashinfer_mla, flashattn_mla, flashmla)",
) )
parser.add_argument( parser.add_argument(
"--backend", "--backend",
help="Single backend (alternative to --backends)", help="Single backend (alternative to --backends)",
) )
parser.add_argument(
"--prefill-backends",
nargs="+",
help="Prefill backends to compare (fa2, fa3, fa4). "
"Uses the first decode backend for impl construction.",
)
# Batch specifications # Batch specifications
parser.add_argument( parser.add_argument(
"--batch-specs", "--batch-specs",
nargs="+", nargs="+",
default=None, default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar", help="Batch specifications using extended grammar",
) )
@@ -480,21 +465,6 @@ def main():
parser.add_argument("--repeats", type=int, default=1, help="Repetitions") parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations") parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory") parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
parser.add_argument(
"--kv-cache-dtype",
default="auto",
choices=["auto", "fp8"],
help="KV cache dtype: auto or fp8",
)
parser.add_argument(
"--cuda-graphs",
action=argparse.BooleanOptionalAction,
default=True,
help=(
"Launch kernels with CUDA graphs to eliminate CPU overhead"
"in measurements (default: True)"
),
)
# Parameter sweep (use YAML config for advanced sweeps) # Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument( parser.add_argument(
@@ -526,24 +496,15 @@ def main():
if "description" in yaml_config: if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]") console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence # Override args with YAML values
# Check if CLI provided backends (they would be non-None and not default) # (YAML takes precedence unless CLI arg was explicitly set)
cli_backends_provided = args.backend is not None or args.backends is not None # Backend(s)
if "backend" in yaml_config:
# Backend(s) - only use YAML if CLI didn't specify args.backend = yaml_config["backend"]
if not cli_backends_provided: args.backends = None
if "backend" in yaml_config: elif "backends" in yaml_config:
args.backend = yaml_config["backend"] args.backends = yaml_config["backends"]
args.backends = None args.backend = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
elif "decode_backends" in yaml_config:
args.backends = yaml_config["decode_backends"]
args.backend = None
# Prefill backends (e.g., ["fa3", "fa4"])
args.prefill_backends = yaml_config.get("prefill_backends", None)
# Check for special modes # Check for special modes
if "mode" in yaml_config: if "mode" in yaml_config:
@@ -553,24 +514,21 @@ def main():
# Batch specs and sizes # Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges # Support both explicit batch_specs and generated batch_spec_ranges
# CLI --batch-specs takes precedence over YAML when provided. if "batch_spec_ranges" in yaml_config:
cli_batch_specs_provided = args.batch_specs is not None # Generate batch specs from ranges
if not cli_batch_specs_provided: generated_specs = generate_batch_specs_from_ranges(
if "batch_spec_ranges" in yaml_config: yaml_config["batch_spec_ranges"]
# Generate batch specs from ranges )
generated_specs = generate_batch_specs_from_ranges( # Combine with any explicit batch_specs
yaml_config["batch_spec_ranges"] if "batch_specs" in yaml_config:
) args.batch_specs = yaml_config["batch_specs"] + generated_specs
# Combine with any explicit batch_specs else:
if "batch_specs" in yaml_config: args.batch_specs = generated_specs
args.batch_specs = yaml_config["batch_specs"] + generated_specs console.print(
else: f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
args.batch_specs = generated_specs )
console.print( elif "batch_specs" in yaml_config:
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]" args.batch_specs = yaml_config["batch_specs"]
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config: if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"] args.batch_sizes = yaml_config["batch_sizes"]
@@ -586,19 +544,13 @@ def main():
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size) args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys) # Benchmark settings
if "device" in yaml_config: if "benchmark" in yaml_config:
args.device = yaml_config["device"] bench = yaml_config["benchmark"]
if "repeats" in yaml_config: args.device = bench.get("device", args.device)
args.repeats = yaml_config["repeats"] args.repeats = bench.get("repeats", args.repeats)
if "warmup_iters" in yaml_config: args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.warmup_iters = yaml_config["warmup_iters"] args.profile_memory = bench.get("profile_memory", args.profile_memory)
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
if "kv_cache_dtype" in yaml_config:
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
if "cuda_graphs" in yaml_config:
args.cuda_graphs = yaml_config["cuda_graphs"]
# Parameter sweep configuration # Parameter sweep configuration
if "parameter_sweep" in yaml_config: if "parameter_sweep" in yaml_config:
@@ -652,19 +604,10 @@ def main():
# Determine backends # Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"]) backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
if not args.batch_specs:
args.batch_specs = ["q2k", "8q1s1k"]
console.print(f"Backends: {', '.join(backends)}") console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}") console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
console.print(f"CUDA graphs: {args.cuda_graphs}")
console.print() console.print()
init_workspace_manager(args.device)
# Run benchmarks # Run benchmarks
all_results = [] all_results = []
@@ -717,8 +660,6 @@ def main():
repeats=args.repeats, repeats=args.repeats,
warmup_iters=args.warmup_iters, warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory, profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
) )
# Add decode pipeline config # Add decode pipeline config
@@ -871,8 +812,6 @@ def main():
"repeats": args.repeats, "repeats": args.repeats,
"warmup_iters": args.warmup_iters, "warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory, "profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
} }
all_results = run_model_parameter_sweep( all_results = run_model_parameter_sweep(
backends, backends,
@@ -895,8 +834,6 @@ def main():
"repeats": args.repeats, "repeats": args.repeats,
"warmup_iters": args.warmup_iters, "warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory, "profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
} }
all_results = run_parameter_sweep( all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -904,95 +841,37 @@ def main():
else: else:
# Normal mode: compare backends # Normal mode: compare backends
decode_results = [] total = len(backends) * len(args.batch_specs)
prefill_results = []
# Run decode backend comparison with tqdm(total=total, desc="Benchmarking") as pbar:
if not prefill_backends: for spec in args.batch_specs:
# No prefill backends specified: compare decode backends as before for backend in backends:
total = len(backends) * len(args.batch_specs) config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
with tqdm(total=total, desc="Benchmarking") as pbar: result = run_benchmark(config)
for spec in args.batch_specs: all_results.append(result)
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
)
result = run_benchmark(config) if not result.success:
decode_results.append(result) console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
if not result.success: pbar.update(1)
console.print(
f"[red]Error {backend} {spec}: {result.error}[/]"
)
pbar.update(1) # Display results
console.print("\n[bold green]Results:[/]")
console.print("\n[bold green]Results:[/]") formatter = ResultsFormatter(console)
formatter = ResultsFormatter(console) formatter.print_table(all_results, backends)
formatter.print_table(decode_results, backends)
# Run prefill backend comparison
if prefill_backends:
# Use first decode backend for impl construction
decode_backend = backends[0]
total = len(prefill_backends) * len(args.batch_specs)
console.print(
f"[yellow]Prefill comparison mode: "
f"using {decode_backend} for decode impl[/]"
)
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
for spec in args.batch_specs:
for pb in prefill_backends:
config = BenchmarkConfig(
backend=decode_backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
prefill_backend=pb,
)
result = run_benchmark(config)
# Label result with prefill backend name for display
labeled_config = replace(result.config, backend=pb)
result = replace(result, config=labeled_config)
prefill_results.append(result)
if not result.success:
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
pbar.update(1)
console.print("\n[bold green]Prefill Backend Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(
prefill_results, prefill_backends, compare_to_fastest=True
)
all_results = decode_results + prefill_results
# Save results # Save results
if all_results: if all_results:


@@ -10,37 +10,18 @@ from dataclasses import asdict, dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
import numpy as np
import torch import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console from rich.console import Console
from rich.table import Table from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparsable specs
return (0, 0, 0)
# Mock classes for vLLM attention infrastructure # Mock classes for vLLM attention infrastructure
class MockHfConfig: class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements.""" """Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None): def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"] self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"] self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -51,8 +32,6 @@ class MockHfConfig:
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"] self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self): def get_text_config(self):
return self return self
@@ -61,7 +40,10 @@ class MockHfConfig:
# Import AttentionLayerBase at module level to avoid circular dependencies # Import AttentionLayerBase at module level to avoid circular dependencies
try: try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError: except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback AttentionLayerBase = object # Fallback
@@ -77,7 +59,6 @@ class MockKVBProj:
self.qk_nope_head_dim = qk_nope_head_dim self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim self.out_dim = qk_nope_head_dim + v_head_dim
self.weight = torch.empty(0, dtype=torch.bfloat16)
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]: def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
""" """
@@ -101,38 +82,6 @@ class MockKVBProj:
return (result,) # Return as tuple to match ColumnParallelLinear API return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
class MockLayer(AttentionLayerBase): class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl. """Mock attention layer with scale parameters and impl.
@@ -164,6 +113,95 @@ class MockLayer(AttentionLayerBase):
return self._kv_cache_spec return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass @dataclass
class ParameterSweep: class ParameterSweep:
"""Configuration for sweeping a backend parameter.""" """Configuration for sweeping a backend parameter."""
@@ -213,11 +251,7 @@ class BenchmarkConfig:
profile_memory: bool = False profile_memory: bool = False
use_cuda_graphs: bool = False use_cuda_graphs: bool = False
# "auto" or "fp8"
kv_cache_dtype: str = "auto"
# MLA-specific # MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None qk_rope_head_dim: int | None = None
@@ -282,19 +316,14 @@ class ResultsFormatter:
backends: List of backend names being compared backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest compare_to_fastest: Show percentage comparison to fastest
""" """
# Group by batch spec, preserving first-occurrence order # Group by batch spec
by_spec = {} by_spec = {}
specs_order = []
for r in results: for r in results:
spec = r.config.batch_spec spec = r.config.batch_spec
if spec not in by_spec: if spec not in by_spec:
by_spec[spec] = {} by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display # Create shortened backend names for display
def shorten_backend_name(name: str) -> str: def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display.""" """Shorten long backend names for table display."""
@@ -308,8 +337,6 @@ class ResultsFormatter:
table = Table(title="Attention Benchmark Results") table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True) table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1 multi = len(backends) > 1
for backend in backends: for backend in backends:
@@ -323,14 +350,12 @@ class ResultsFormatter:
table.add_column(col_rel, justify="right", no_wrap=False) table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows # Add rows
for spec in specs_order: for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec] spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success} times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0 best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec) row = [spec]
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends: for backend in backends:
if backend in spec_results: if backend in spec_results:
r = spec_results[backend] r = spec_results[backend]
@@ -372,7 +397,6 @@ class ResultsFormatter:
"backend", "backend",
"batch_spec", "batch_spec",
"num_layers", "num_layers",
"kv_cache_dtype",
"mean_time", "mean_time",
"std_time", "std_time",
"throughput", "throughput",
@@ -386,7 +410,6 @@ class ResultsFormatter:
"backend": r.config.backend, "backend": r.config.backend,
"batch_spec": r.config.batch_spec, "batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers, "num_layers": r.config.num_layers,
"kv_cache_dtype": r.config.kv_cache_dtype,
"mean_time": r.mean_time, "mean_time": r.mean_time,
"std_time": r.std_time, "std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0, "throughput": r.throughput_tokens_per_sec or 0,
@@ -463,11 +486,10 @@ def get_attention_scale(head_dim: int) -> float:
def is_mla_backend(backend: str) -> bool: def is_mla_backend(backend: str) -> bool:
""" """
Check if backend is an MLA backend using the AttentionBackendEnum. Check if backend is an MLA backend using the backend's is_mla() property.
Args: Args:
backend: Backend name matching AttentionBackendEnum exactly backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
(e.g., "FLASHMLA_SPARSE")
Returns: Returns:
True if the backend is an MLA backend, False otherwise True if the backend is an MLA backend, False otherwise
@@ -475,8 +497,7 @@ def is_mla_backend(backend: str) -> bool:
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
try: try:
backend_enum = AttentionBackendEnum[backend] backend_class = AttentionBackendEnum[backend.upper()].get_class()
backend_class = backend_enum.get_class()
return backend_class.is_mla() return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError): except (KeyError, ValueError, ImportError):
return False return False


@@ -3,7 +3,7 @@
model: model:
name: "deepseek-v3" name: "deepseek-v3"
num_layers: 60 num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576 head_dim: 576
kv_lora_rank: 512 kv_lora_rank: 512
@@ -12,13 +12,6 @@ model:
v_head_dim: 128 v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs: batch_specs:
# Small batches, varying sequence lengths # Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache - "16q1s512" # 16 requests, 512 KV cache
@@ -41,30 +34,28 @@ batch_specs:
# Very large batches # Very large batches
- "128q1s1k" # 128 requests, 1k KV cache - "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache - "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context # Long context
- "32q1s16k" # 32 requests, 16k KV cache - "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache - "32q1s32k" # 32 requests, 32k KV cache
backends: backends:
- CUTLASS_MLA - cutlass_mla
- FLASHINFER_MLA - flashinfer_mla
- FLASH_ATTN_MLA # Hopper only - flashattn_mla # Hopper only
- FLASHMLA # Hopper only - flashmla # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 100 repeats: 5
warmup_iters: 10 warmup_iters: 3
profile_memory: true profile_memory: true
# Backend-specific tuning # Backend-specific tuning
CUTLASS_MLA: cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16 num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA: flashattn_mla:
reorder_batch_threshold: 512 reorder_batch_threshold: 512
FLASHMLA: flashmla:
reorder_batch_threshold: 1 reorder_batch_threshold: 1
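For readers unfamiliar with the spec strings above: each underscore-separated group reads <count>q<q_len>[s<seq_len>], with a "k" suffix meaning x1024, so "16q1s512" is 16 decode requests with query length 1 against a 512-token KV cache, and "2q4k_32q1s1k" mixes 2 prefills of 4k tokens with 32 decodes. A rough, illustrative parser (not the benchmark's actual parse_batch_spec; it assumes pure-prefill groups default the sequence length to the query length):

import re

def _size(tok: str) -> int:
    # "512" -> 512, "4k" -> 4096
    return int(tok[:-1]) * 1024 if tok.endswith("k") else int(tok)

def parse_spec(spec: str) -> list[tuple[int, int, int]]:
    """Return (num_requests, q_len, seq_len) for each group in the spec."""
    groups = []
    for group in spec.split("_"):
        m = re.fullmatch(r"(\d*)q(\d+k?)(?:s(\d+k?))?", group)
        if m is None:
            raise ValueError(f"unrecognized group: {group!r}")
        count = int(m.group(1)) if m.group(1) else 1
        q_len = _size(m.group(2))
        seq_len = _size(m.group(3)) if m.group(3) else q_len
        groups.append((count, q_len, seq_len))
    return groups

print(parse_spec("16q1s512"))      # [(16, 1, 512)]
print(parse_spec("2q4k_32q1s1k"))  # [(2, 4096, 4096), (32, 1, 1024)]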

View File

@@ -30,9 +30,9 @@ batch_specs:
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode - "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode # Context extension + decode
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode - "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode - "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode - "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill # Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint - "q8k" # 8k prefill with chunking hint
@@ -45,10 +45,10 @@ batch_specs:
- "4q4k_60q1s4k" # 4 prefill + 60 decode - "4q4k_60q1s4k" # 4 prefill + 60 decode
backends: backends:
- CUTLASS_MLA - cutlass_mla
- FLASHINFER_MLA - flashinfer_mla
- FLASH_ATTN_MLA # Hopper only - flashattn_mla # Hopper only
- FLASHMLA # Hopper only - flashmla # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -1,126 +0,0 @@
# MLA prefill backend comparison
#
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
#
# Backends that aren't available on the current platform will report errors
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
#
# Usage:
# python benchmark.py --config configs/mla_prefill.yaml
description: "MLA prefill backend comparison"
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# model:
# name: "deepseek-v2-lite"
# num_layers: 27
# num_q_heads: 16
# num_kv_heads: 1
# head_dim: 576
# kv_lora_rank: 512
# qk_nope_head_dim: 128
# qk_rope_head_dim: 64
# v_head_dim: 128
# block_size: 128
batch_specs:
# Pure prefill
- "q512"
- "q1k"
- "q2k"
- "q4k"
- "q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Chunked prefill / extend
# Short context
- "q128s1k"
- "q256s2k"
- "q512s4k"
- "q1ks4k"
- "q2ks8k"
- "2q128s1k"
- "2q256s2k"
- "2q512s4k"
- "2q1ks4k"
- "2q2ks8k"
- "4q128s1k"
- "4q256s2k"
- "4q512s4k"
- "4q1ks4k"
- "4q2ks8k"
- "8q128s1k"
- "8q256s2k"
- "8q512s4k"
- "8q1ks4k"
# Medium context
- "q128s16k"
- "q512s16k"
- "q1ks16k"
- "q2ks16k"
- "2q128s16k"
- "2q512s16k"
- "2q1ks16k"
- "2q2ks16k"
- "4q128s16k"
- "4q512s16k"
- "4q1ks16k"
- "4q2ks16k"
# Long context
- "q128s64k"
- "q512s64k"
- "q1ks64k"
- "q2ks64k"
- "2q128s64k"
- "2q512s64k"
- "2q1ks64k"
- "2q2ks64k"
decode_backends:
- CUTLASS_MLA
prefill_backends:
- fa2
- fa3
- fa4
- flashinfer
- cudnn
- trtllm
device: "cuda:0"
repeats: 20
warmup_iters: 5

View File

@@ -1,58 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -6,7 +6,7 @@
description: "Decode vs Prefill pipeline crossover analysis" description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA # Test FlashAttn MLA
backend: FLASH_ATTN_MLA backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode) # Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,10 +62,11 @@ model:
block_size: 128 block_size: 128
# Benchmark settings # Benchmark settings
device: "cuda:0" benchmark:
repeats: 15 # More repeats for spec decode variance device: "cuda:0"
warmup_iters: 5 repeats: 15 # More repeats for spec decode variance
profile_memory: false warmup_iters: 5
profile_memory: false
# Output # Output
output: output:

View File

@@ -41,17 +41,18 @@ batch_specs:
# Backends that support query length > 1 # Backends that support query length > 1
backends: backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512 - flashattn_mla # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable) - flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - FLASHINFER_MLA # - flashinfer_mla
# Benchmark settings # Benchmark settings
device: "cuda:0" benchmark:
repeats: 10 # More repeats for statistical significance device: "cuda:0"
warmup_iters: 5 repeats: 10 # More repeats for statistical significance
profile_memory: false warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization # Test these threshold values for optimization
parameter_sweep: parameter_sweep:

View File

@@ -25,22 +25,14 @@ batch_specs:
- "4q1k_16q1s2k" # 4 prefill + 16 decode - "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode - "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8) # Context extension
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache - "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence - "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends: backends:
- FLASH_ATTN - flash
- TRITON_ATTN - triton
- FLASHINFER - flashinfer
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -8,13 +8,14 @@ This module provides helpers for running MLA backends without
needing full VllmConfig integration. needing full VllmConfig integration.
""" """
import importlib
import numpy as np import numpy as np
import torch import torch
from batch_spec import parse_batch_spec from batch_spec import parse_batch_spec
from common import ( from common import (
BenchmarkResult, BenchmarkResult,
MockHfConfig, MockHfConfig,
MockIndexer,
MockKVBProj, MockKVBProj,
MockLayer, MockLayer,
setup_mla_dims, setup_mla_dims,
@@ -60,11 +61,7 @@ def create_minimal_vllm_config(
model_name: str = "deepseek-v3", model_name: str = "deepseek-v3",
block_size: int = 128, block_size: int = 128,
max_num_seqs: int = 256, max_num_seqs: int = 256,
max_num_batched_tokens: int = 8192,
mla_dims: dict | None = None, mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
kv_cache_dtype: str = "auto",
) -> VllmConfig: ) -> VllmConfig:
""" """
Create minimal VllmConfig for MLA benchmarks. Create minimal VllmConfig for MLA benchmarks.
@@ -76,11 +73,6 @@ def create_minimal_vllm_config(
max_num_seqs: Maximum number of sequences max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name) setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
"cudnn", "trtllm"). Configures the attention config to
force the specified prefill backend.
Returns: Returns:
VllmConfig for benchmarking VllmConfig for benchmarking
@@ -90,7 +82,7 @@ def create_minimal_vllm_config(
mla_dims = setup_mla_dims(model_name) mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace) # Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk) mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads # Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access # This ensures consistent ModelConfig construction without network access
@@ -128,12 +120,16 @@ def create_minimal_vllm_config(
seed=0, seed=0,
max_model_len=32768, max_model_len=32768,
quantization=None, quantization=None,
quantization_param_path=None,
enforce_eager=False, enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20, max_logprobs=20,
disable_sliding_window=False, disable_sliding_window=False,
skip_tokenizer_init=True, skip_tokenizer_init=True,
served_model_name=None, served_model_name=None,
limit_mm_per_prompt=None, limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto", config_format="auto",
) )
finally: finally:
@@ -151,13 +147,14 @@ def create_minimal_vllm_config(
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=block_size, block_size=block_size,
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
cache_dtype=kv_cache_dtype, swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False, enable_prefix_caching=False,
) )
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs), max_num_batched_tokens=8192,
max_model_len=32768, max_model_len=32768,
is_encoder_decoder=False, is_encoder_decoder=False,
enable_chunked_prefill=True, enable_chunked_prefill=True,
@@ -169,7 +166,7 @@ def create_minimal_vllm_config(
compilation_config = CompilationConfig() compilation_config = CompilationConfig()
vllm_config = VllmConfig( return VllmConfig(
model_config=model_config, model_config=model_config,
cache_config=cache_config, cache_config=cache_config,
parallel_config=parallel_config, parallel_config=parallel_config,
@@ -177,147 +174,62 @@ def create_minimal_vllm_config(
compilation_config=compilation_config, compilation_config=compilation_config,
) )
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
"disable_flashinfer_prefill"
]
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
"use_cudnn_prefill"
]
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
"use_trtllm_ragged_deepseek_prefill"
]
return vllm_config
# ============================================================================ # ============================================================================
# Prefill Backend Configuration # Backend Configuration
# ============================================================================ # ============================================================================
# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths. # Backend name to class name prefix mapping
# Non-FA backends enable their specific path and disable others. _BACKEND_NAME_MAP = {
_PREFILL_BACKEND_CONFIG: dict[str, dict] = { "flashattn_mla": "FlashAttnMLA",
"fa2": { "flashmla": "FlashMLA",
"flash_attn_version": 2, "flashinfer_mla": "FlashInferMLA",
"disable_flashinfer_prefill": True, "cutlass_mla": "CutlassMLA",
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa3": {
"flash_attn_version": 3,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa4": {
"flash_attn_version": 4,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"flashinfer": {
"flash_attn_version": None,
"disable_flashinfer_prefill": False,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"cudnn": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": True,
"use_trtllm_ragged_deepseek_prefill": False,
},
"trtllm": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": True,
},
} }
# Special properties that differ from defaults
def get_prefill_backend_config(prefill_backend: str) -> dict:
"""Get attention config overrides for a prefill backend."""
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
raise ValueError(
f"Unknown prefill backend: {prefill_backend!r}. "
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
)
return _PREFILL_BACKEND_CONFIG[prefill_backend]
# ============================================================================
# Decode Backend Configuration
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
_BACKEND_PROPERTIES = { _BACKEND_PROPERTIES = {
"FLASHMLA": { "flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple) "query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
}, },
"FLASHMLA_SPARSE": { "flashinfer_mla": {
"query_format": "concat", # Single concatenated tensor (vs tuple) "block_size": 64, # FlashInfer MLA only supports 32 or 64
}, },
} }
def _get_backend_config(backend: str) -> dict: def _get_backend_config(backend: str) -> dict:
""" """
Get backend configuration from AttentionBackendEnum. Get backend configuration using naming conventions.
Uses the registry to get the backend class and extract configuration All MLA backends follow the pattern:
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.). - Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
Args: - Metadata: {Name}Metadata (or MLACommonMetadata)
backend: Backend name matching AttentionBackendEnum exactly - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
(e.g., "FLASHMLA_SPARSE") - MetadataBuilder: {Name}MetadataBuilder
Returns:
Dict with backend configuration
""" """
from vllm.v1.attention.backend import MultipleOf if backend not in _BACKEND_NAME_MAP:
from vllm.v1.attention.backends.registry import AttentionBackendEnum raise ValueError(f"Unknown backend: {backend}")
try: name = _BACKEND_NAME_MAP[backend]
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
# No fixed block size; fall back to config value
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {}) props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return { return {
"backend_class": backend_class, "module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": backend_class.get_impl_cls(), "impl_class": f"{name}Impl",
"builder_class": backend_class.get_builder_cls(), "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"), "query_format": props.get("query_format", "tuple"),
"block_size": block_size, "block_size": props.get("block_size", None),
"is_sparse": is_sparse,
} }
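A condensed sketch of the registry-driven resolution that replaces the old name map here: the impl class, builder class, kernel block size, and sparsity all come off the backend class returned by AttentionBackendEnum. This assumes the enum members and class methods referenced in this hunk exist in the installed vLLM; block sizes expressed as MultipleOf fall back to the benchmark config value, as in the code above.

from vllm.v1.attention.backend import MultipleOf
from vllm.v1.attention.backends.registry import AttentionBackendEnum

backend_class = AttentionBackendEnum["FLASHMLA"].get_class()
block_sizes = backend_class.get_supported_kernel_block_sizes()
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
    block_size = None  # no fixed kernel block size; defer to the config value
print(backend_class.get_impl_cls(), backend_class.get_builder_cls(), block_size)
print(getattr(backend_class, "is_sparse", lambda: False)())  # expected False for dense FLASHMLA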
@@ -535,27 +447,22 @@ def _create_backend_impl(
mla_dims: dict, mla_dims: dict,
vllm_config: VllmConfig, vllm_config: VllmConfig,
device: torch.device, device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
kv_cache_dtype: str = "auto",
): ):
""" """
Create backend implementation instance. Create backend implementation instance.
Args: Args:
backend_cfg: Backend configuration dict from _get_backend_config() backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance vllm_config: VllmConfig instance
device: Target device device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns: Returns:
Tuple of (impl, layer, builder_instance, indexer) Tuple of (impl, layer, builder_instance)
""" """
# Get classes from backend config (already resolved by _get_backend_config) # Import backend classes
impl_class = backend_cfg["impl_class"] backend_module = importlib.import_module(backend_cfg["module"])
builder_class = backend_cfg["builder_class"] impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale # Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
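For concreteness, the softmax scale computed here works out as follows for the DeepSeek dimensions used in these configs (qk_nope_head_dim=128, qk_rope_head_dim=64):

import math

scale = 1.0 / math.sqrt(128 + 64)  # qk_nope_head_dim + qk_rope_head_dim = 192
print(round(scale, 5))             # ~0.07217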
@@ -567,44 +474,26 @@ def _create_backend_impl(
v_head_dim=mla_dims["v_head_dim"], v_head_dim=mla_dims["v_head_dim"],
) )
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": kv_cache_dtype,
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl # Create impl
impl = impl_class(**impl_kwargs) impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes # Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
@@ -626,7 +515,9 @@ def _create_backend_impl(
# Create builder instance if needed # Create builder instance if needed
builder_instance = None builder_instance = None
if builder_class: if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer # Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer} vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -638,7 +529,7 @@ def _create_backend_impl(
device=device, device=device,
) )
return impl, layer, builder_instance, indexer return impl, layer, builder_instance
# ============================================================================ # ============================================================================
@@ -703,8 +594,6 @@ def _run_single_benchmark(
backend_cfg: dict, backend_cfg: dict,
mla_dims: dict, mla_dims: dict,
device: torch.device, device: torch.device,
indexer=None,
kv_cache_dtype: str | None = None,
) -> BenchmarkResult: ) -> BenchmarkResult:
""" """
Run a single benchmark iteration. Run a single benchmark iteration.
@@ -717,7 +606,6 @@ def _run_single_benchmark(
backend_cfg: Backend configuration dict backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
device: Target device device: Target device
indexer: Optional MockIndexer for sparse backends
Returns: Returns:
BenchmarkResult with timing statistics BenchmarkResult with timing statistics
@@ -725,9 +613,7 @@ def _run_single_benchmark(
# Parse batch spec # Parse batch spec
requests = parse_batch_spec(config.batch_spec) requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests] q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens) total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size # Determine block size
block_size = backend_cfg["block_size"] or config.block_size block_size = backend_cfg["block_size"] or config.block_size
@@ -738,123 +624,45 @@ def _run_single_benchmark(
) )
# Create KV cache # Create KV cache
if kv_cache_dtype is None: kv_cache = torch.zeros(
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto") num_blocks,
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"] block_size,
if kv_cache_dtype == "fp8_ds_mla": mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8. device=device,
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales dtype=torch.bfloat16,
# + 2*rope_dim bf16 bytes )
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
kv_cache = torch.zeros(
num_blocks,
block_size,
656,
device=device,
dtype=torch.uint8,
)
elif kv_cache_dtype == "fp8":
from vllm.platforms import current_platform
kv_cache = torch.zeros( # Create input tensors for both decode and prefill modes
num_blocks, decode_inputs, prefill_inputs = _create_input_tensors(
block_size, total_q,
head_size, mla_dims,
device=device, backend_cfg["query_format"],
dtype=torch.uint8, device,
).view(current_platform.fp8_dtype()) torch.bfloat16,
)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else: else:
kv_cache = torch.zeros(
num_blocks,
block_size,
head_size,
device=device,
dtype=torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward methods to use based on metadata.
# Sparse MLA backends always use forward_mqa
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
if not has_decode and not has_prefill:
raise RuntimeError("Metadata has neither decode nor prefill metadata") raise RuntimeError("Metadata has neither decode nor prefill metadata")
num_decode = (
metadata.num_decode_tokens
if (has_decode and has_prefill)
else total_q
if has_decode
else 0
)
num_prefill = total_q - num_decode
# Some backends require fp8 queries when using fp8 KV cache.
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
quantize_query = is_fp8_kvcache and getattr(
impl, "supports_quant_query_input", False
)
# quantize_query forces concat format
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
# Create decode query tensors
if has_decode:
decode_inputs, _ = _create_input_tensors(
num_decode, mla_dims, query_fmt, device, torch.bfloat16
)
# Cast decode query to fp8 if the backend supports it
if quantize_query:
from vllm.platforms import current_platform
if isinstance(decode_inputs, tuple):
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
# Create prefill input tensors
if has_prefill:
_, prefill_inputs = _create_input_tensors(
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
)
# Build forward function
def forward_fn():
results = []
if has_decode:
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
if has_prefill:
results.append(
impl.forward_mha(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
)
return results[0] if len(results) == 1 else tuple(results)
# Warmup # Warmup
for _ in range(config.warmup_iters): for _ in range(config.warmup_iters):
forward_fn() forward_fn()
torch.accelerator.synchronize() torch.cuda.synchronize()
# Optionally capture a CUDA graph after warmup.
# Graph replay eliminates CPU launch overhead so timings reflect pure
# kernel time.
if config.use_cuda_graphs:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
forward_fn()
benchmark_fn = graph.replay
else:
benchmark_fn = forward_fn
# Benchmark # Benchmark
times = [] times = []
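A quick sanity check of the 656-byte fp8_ds_mla per-token layout described in the hunk above, using the DeepSeek dimensions from these configs (kv_lora_rank=512, qk_rope_head_dim=64); the figures come straight from the comments in the diff, not from re-deriving the kernel format:

kv_lora_rank_bytes = 512   # kv_lora_rank values stored as 1-byte fp8
tile_scale_bytes = 4 * 4   # 4 float32 tile scales, 4 bytes each
rope_bytes = 2 * 64        # qk_rope_head_dim values kept in bf16 (2 bytes each)
assert kv_lora_rank_bytes + tile_scale_bytes + rope_bytes == 656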
@@ -864,10 +672,10 @@ def _run_single_benchmark(
start.record() start.record()
for _ in range(config.num_layers): for _ in range(config.num_layers):
benchmark_fn() forward_fn()
end.record() end.record()
torch.accelerator.synchronize() torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end) elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) times.append(elapsed_ms / 1000.0 / config.num_layers)
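The timing loop above follows the standard CUDA-event pattern: record a start event, run the kernel num_layers times, record an end event, synchronize, and divide the elapsed time by the iteration count (with config.use_cuda_graphs swapping forward_fn for a captured graph's replay to strip CPU launch overhead). A self-contained sketch of that pattern, assuming only PyTorch and a CUDA device:

import torch

def time_fn(fn, repeats: int = 5, inner_iters: int = 60) -> float:
    """Mean seconds per call, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    times = []
    for _ in range(repeats):
        start.record()
        for _ in range(inner_iters):
            fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end) / 1000.0 / inner_iters)
    return sum(times) / len(times)

x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
print(time_fn(lambda: x @ x))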
@@ -885,26 +693,20 @@ def _run_single_benchmark(
def _run_mla_benchmark_batched( def _run_mla_benchmark_batched(
backend: str, backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> list[BenchmarkResult]: ) -> list[BenchmarkResult]:
""" """
Unified batched MLA benchmark runner for all backends. Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
flashinfer_mla_sparse, flashmla_sparse
This function reuses backend initialization across multiple benchmarks This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead. to avoid setup/teardown overhead.
Args: Args:
backend: Backend name (decode backend used for impl construction) backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only) - num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns: Returns:
List of BenchmarkResult objects List of BenchmarkResult objects
@@ -914,7 +716,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend) backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device) device = torch.device(configs_with_params[0][0].device)
torch.accelerator.set_device_index(device) torch.cuda.set_device(device)
# Determine block size # Determine block size
config_block_size = configs_with_params[0][0].block_size config_block_size = configs_with_params[0][0].block_size
@@ -928,94 +730,21 @@ def _run_mla_benchmark_batched(
if mla_dims is None: if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3") mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Extract kv_cache_dtype from the first config
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
kv_cache_dtype = "fp8_ds_mla"
# Compute max total_q across all configs so the metadata builder buffer
# and scheduler config are large enough for all batch specs.
max_total_q = max(
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
for cfg, *_ in configs_with_params
)
# Create and set vLLM config for MLA (reused across all benchmarks) # Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config( vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path model_name="deepseek-v3", # Used only for model path
block_size=block_size, block_size=block_size,
max_num_batched_tokens=max_total_q,
mla_dims=mla_dims, # Use custom dims from config or default mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
kv_cache_dtype=kv_cache_dtype,
) )
results = [] results = []
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
# Clear cached prefill backend detection functions so they re-evaluate # Create backend impl, layer, and builder (reused across benchmarks)
# with the current VllmConfig. These are @functools.cache decorated and impl, layer, builder_instance = _create_backend_impl(
# would otherwise return stale results from a previous backend's config. backend_cfg, mla_dims, vllm_config, device
from vllm.model_executor.layers.attention.mla_attention import (
use_cudnn_prefill,
use_flashinfer_prefill,
use_trtllm_ragged_deepseek_prefill,
) )
use_flashinfer_prefill.cache_clear()
use_cudnn_prefill.cache_clear()
use_trtllm_ragged_deepseek_prefill.cache_clear()
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
max_num_tokens=max_total_q,
index_topk=index_topk if is_sparse else None,
kv_cache_dtype=kv_cache_dtype,
)
# Verify the actual prefill backend matches what was requested
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
fa_version = prefill_cfg["flash_attn_version"]
if fa_version is not None:
# FA backend: verify the impl's FA version
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
f"version {fa_version}, but the impl is using FA "
f"version {actual_fa_version}. Check "
f"vllm/v1/attention/backends/fa_utils.py."
)
else:
# Non-FA backend: verify the builder picked the right path
expected_flags = {
"flashinfer": "_use_fi_prefill",
"cudnn": "_use_cudnn_prefill",
"trtllm": "_use_trtllm_ragged_prefill",
}
flag_name = expected_flags.get(prefill_backend)
if flag_name and not getattr(builder_instance, flag_name, False):
raise RuntimeError(
f"Prefill backend '{prefill_backend}' was requested "
f"but the metadata builder did not enable it. This "
f"usually means a dependency is missing (e.g., "
f"flashinfer not installed) or the platform doesn't "
f"support it."
)
# Run each benchmark with the shared impl # Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params: for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only) # Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -1039,8 +768,6 @@ def _run_mla_benchmark_batched(
backend_cfg, backend_cfg,
mla_dims, mla_dims,
device, device,
indexer=indexer,
kv_cache_dtype=kv_cache_dtype,
) )
results.append(result) results.append(result)
@@ -1066,27 +793,20 @@ def run_mla_benchmark(
config, config,
reorder_batch_threshold: int | None = None, reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None, num_kv_splits: int | None = None,
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]: ) -> BenchmarkResult | list[BenchmarkResult]:
""" """
Unified MLA benchmark runner for all backends. Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
flashinfer_mla_sparse, flashmla_sparse
Always uses batched execution internally for optimal performance. Always uses batched execution internally for optimal performance.
Args: Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
flashinfer_mla_sparse, flashmla_sparse)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only) (single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only) num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns: Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -1096,9 +816,9 @@ def run_mla_benchmark(
# Already in batched format # Already in batched format
if len(config) > 0 and isinstance(config[0], tuple): if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits # Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"): if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config] configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config] configs_with_params = [(cfg, None, param) for cfg, param in config]
else: else:
# Format: [cfg, ...] - just configs # Format: [cfg, ...] - just configs
@@ -1110,9 +830,7 @@ def run_mla_benchmark(
return_single = True return_single = True
# Use unified batched execution # Use unified batched execution
results = _run_mla_benchmark_batched( results = _run_mla_benchmark_batched(backend, configs_with_params)
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
)
# Return single result or list based on input # Return single result or list based on input
return results[0] if return_single else results return results[0] if return_single else results

Some files were not shown because too many files have changed in this diff.