Compare commits: v0.16.1rc0...v0.17.2rc0 (671 commits)
@@ -10,7 +10,7 @@ steps:
       docker build
         --build-arg max_jobs=16
         --build-arg REMOTE_VLLM=1
-        --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
+        --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
        --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
        --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
        -f docker/Dockerfile.rocm
@@ -21,6 +21,20 @@ steps:
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py"

+- label: CPU-Compatibility Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+    - cmake/cpu_extension.cmake
+    - setup.py
+    - vllm/platforms/cpu.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
+
 - label: CPU-Language Generation and Pooling Model Tests
   depends_on: []
   soft_fail: true
@@ -25,9 +25,7 @@ fi
 docker build --file docker/Dockerfile.cpu \
     --build-arg max_jobs=16 \
     --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-    --build-arg VLLM_CPU_AVX512BF16=true \
-    --build-arg VLLM_CPU_AVX512VNNI=true \
-    --build-arg VLLM_CPU_AMXBF16=true \
+    --build-arg VLLM_CPU_X86=true \
     --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
     --target vllm-test \
     --progress plain .
@@ -13,9 +13,10 @@ import os
 from contextlib import contextmanager

 import lm_eval
-import numpy as np
 import yaml

+from vllm.platforms import current_platform
+
 DEFAULT_RTOL = 0.08


@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
             "allow_deprecated_quantization=True,"
         )

+    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
+        model_args += "attention_backend=TRITON_ATTN"
+
     env_vars = eval_config.get("env_vars", None)
     with scoped_env_vars(env_vars):
         results = lm_eval.simple_evaluate(
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
            f"ground_truth={ground_truth:.3f} | "
            f"measured={measured_value:.3f} | rtol={rtol}"
        )
-        success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+
+        min_acceptable = ground_truth * (1 - rtol)
+        success = success and measured_value >= min_acceptable

    assert success
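For context on the acceptance-check change above: `np.isclose(..., rtol=...)` is a two-sided band, so a run that scores well *above* ground truth could fail, while the new check is a one-sided floor. A minimal standalone sketch (illustrative numbers, not from the diff):

```python
# Contrast the old two-sided tolerance with the new one-sided floor.
import numpy as np

ground_truth, rtol = 0.80, 0.08

for measured in (0.75, 0.88):
    old_ok = np.isclose(ground_truth, measured, rtol=rtol)  # two-sided band
    new_ok = measured >= ground_truth * (1 - rtol)          # one-sided floor
    print(f"measured={measured}: old={old_ok}, new={new_ok}")

# measured=0.88 fails the old check (outside the +/- band) but passes the new
# floor, which only penalizes scores below ground_truth * (1 - rtol) = 0.736.
```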
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
 "server_parameters": {
     "model": "meta-llama/Meta-Llama-3-8B",
     "tensor_parallel_size": 1,
-    "swap_space": 16,
     "disable_log_stats": "",
     "load_format": "dummy"
 },
@@ -7,12 +7,12 @@ import argparse
 import html as _html
 import json
 import os
+from contextlib import nullcontext
 from dataclasses import dataclass
 from importlib import util
 from pathlib import Path

 import pandas as pd
-import regex as re

 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
 pd.set_option("display.float_format", lambda x: f"{x:.2f}")


+# -----------------------------
+# Concurrency normalization (NEW, small)
+# -----------------------------
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+
+    for c in df.columns:
+        if "concurr" in str(c).lower():
+            s = df[c]
+            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
+                return c
+
+    raise ValueError(
+        "Cannot infer concurrency column. "
+        "Please rename the column to one of the known names "
+        "or add an explicit override (e.g., --concurrency-col)."
+    )
+
+
+def _normalize_concurrency_in_df(
+    df: pd.DataFrame, canonical: str = "# of max concurrency."
+) -> pd.DataFrame:
+    if canonical in df.columns:
+        return df
+    detected = _find_concurrency_col(df)
+    if detected in df.columns and detected != canonical:
+        return df.rename(columns={detected: canonical})
+    df[canonical] = pd.NA
+    return df
+
+
 # -----------------------------
 # Core data compare
 # -----------------------------
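What the normalization above buys: every input file ends up with the same canonical concurrency column name before any cross-file joins. A self-contained toy sketch of the rename step (the sample DataFrame is made up; the helper's detection logic is mirrored inline):

```python
import pandas as pd

df = pd.DataFrame({"Model": ["m1", "m2"], "Max Concurrency": [32, 64]})

canonical = "# of max concurrency."
if canonical not in df.columns:
    # mirrors _normalize_concurrency_in_df: detect a known alias, rename it
    df = df.rename(columns={"Max Concurrency": canonical})

print(df.columns.tolist())  # ['Model', '# of max concurrency.']
```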
@@ -52,19 +91,25 @@ def compare_data_columns(
     - Concat along axis=1 (indexes align), then reset_index so callers can
       group by columns.
     - If --debug, add a <file_label>_name column per file.
+
+    Minimal fix to support different max_concurrency lists across files:
+      - normalize concurrency column naming to "# of max concurrency."
+      - align on UNION of keys (missing points become NaN)
+      - BUGFIX: don't drop throughput rows based on P99/Median presence
     """
     print("\ncompare_data_column:", data_column)

     frames = []
     raw_data_cols: list[str] = []
-    compare_frames = []

+    # Determine key cols after normalizing concurrency
     cols_per_file: list[set] = []
     for f in files:
         try:
             df_tmp = pd.read_json(f, orient="records")
         except Exception as err:
             raise ValueError(f"Failed to read {f}") from err
+        df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
         cols_per_file.append(set(df_tmp.columns))

     key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -75,12 +120,25 @@
             "No common key columns found from info_cols across the input files."
         )

-    meta_added = False
+    union_index = None
+    metas: list[pd.DataFrame] = []
+    staged: list[tuple[str, pd.Series, pd.Series | None]] = []
+
     for file in files:
         df = pd.read_json(file, orient="records")
+        df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")

-        if drop_column in df.columns:
+        # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
+        # NaN in P99/Median columns even if the column exists in the JSON.
+        metric_lc = str(data_column).lower()
+        is_latency_metric = (
+            "ttft" in metric_lc
+            or "tpot" in metric_lc
+            or "p99" in metric_lc
+            or "median" in metric_lc
+            or metric_lc.strip() in {"p99", "median"}
+        )
+        if is_latency_metric and drop_column in df.columns:
             df = df.dropna(subset=[drop_column], ignore_index=True)

         for c in (
@@ -105,35 +163,61 @@
         meta = meta.groupby(level=key_cols, dropna=False).first()

         file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)

+        if data_column in df_idx.columns:
             s = df_idx[data_column]
             if not s.index.is_unique:
                 s = s.groupby(level=key_cols, dropna=False).mean()
+        else:
+            # keep NA series to preserve meta keys for union_index
+            s = pd.Series(pd.NA, index=meta.index)
         s.name = file_label

-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
+        name_s = None

         if debug and name_column in df_idx.columns:
             name_s = df_idx[name_column]
             if not name_s.index.is_unique:
                 name_s = name_s.groupby(level=key_cols, dropna=False).first()
             name_s.name = f"{file_label}_name"
-            frames.append(name_s)

-        frames.append(s)
+        if union_index is None:
+            union_index = meta.index
+        else:
+            union_index = union_index.union(meta.index)
+        metas.append(meta)
+
+        staged.append((file_label, s, name_s))
+
+    if union_index is None:
+        raise ValueError("No data found after loading inputs.")
+
+    # meta first (union-aligned): build UNION meta across all files
+    if metas:
+        meta_union = pd.concat(metas, axis=0)
+        # Collapse duplicates on the MultiIndex; keep first non-null per column
+        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
+        frames.append(meta_union.reindex(union_index))
+
+    # values + ratios (union-aligned)
+    metric_series_aligned: list[pd.Series] = []
+    for file_label, s, name_s in staged:
+        s_aligned = s.reindex(union_index)
+        frames.append(s_aligned)
         raw_data_cols.append(file_label)
-        compare_frames.append(s)
+        metric_series_aligned.append(s_aligned)

-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
+        if debug and name_s is not None:
+            frames.append(name_s.reindex(union_index))
+
+        if len(metric_series_aligned) >= 2:
+            base = metric_series_aligned[0]
+            current = metric_series_aligned[-1]
+            if "P99" in str(data_column) or "Median" in str(data_column):
                 ratio = base / current
             else:
                 ratio = current / base
             ratio = ratio.mask(base == 0)
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+            ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
             frames.append(ratio)

     concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
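The restructured loop above aligns every file's metric series on the union of group keys instead of silently dropping points that only one file measured. A small self-contained pandas sketch of that union-reindex idea (illustrative data, not from the benchmarks):

```python
# Two runs measured at different concurrency points: reindex both onto the
# union of their keys so missing points become NaN instead of dropped rows.
import pandas as pd

a = pd.Series([10.0, 20.0], index=pd.Index([1, 2], name="concurrency"))
b = pd.Series([12.0, 25.0], index=pd.Index([2, 4], name="concurrency"))

union_index = a.index.union(b.index)  # Index([1, 2, 4])
aligned = pd.concat(
    [a.reindex(union_index), b.reindex(union_index)], axis=1, keys=["base", "new"]
)
print(aligned)  # row 1 has NaN for "new", row 4 has NaN for "base"
```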
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
 # -----------------------------
 # Styling helpers
 # -----------------------------
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
 def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
+    df: pd.DataFrame,
+    threshold: float,
+    slack_pct: float = 0.0,
 ) -> pd.io.formats.style.Styler:
     conc_col = _find_concurrency_col(df)
     key_cols = [
@@ -234,12 +304,24 @@ def _highlight_threshold(
     ]
     conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]

-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
+    try:
+        slack_pct = float(slack_pct or 0.0)
+    except Exception:
+        slack_pct = 0.0
+    slack_limit = threshold * (1.0 + slack_pct / 100.0)
+
+    def _cell(v):
+        if pd.isna(v):
+            return ""
+        if v <= threshold:
+            # Strict SLA
+            return "background-color:#e6ffe6;font-weight:bold;"
+        if v <= slack_limit:
+            # Within slack range
+            return "background-color:#ffe5cc;font-weight:bold;"
+        return ""
+
+    return df.style.map(_cell, subset=conf_cols)


 def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
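The slack logic above accepts values slightly over the SLA and styles them separately from strict passes. A standalone sketch of the arithmetic, with illustrative numbers:

```python
# With a 100 ms SLA and 5% slack, values up to 105 ms are still accepted
# (shown in the orange "within slack" band rather than the green strict band).
threshold, slack_pct = 100.0, 5.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)  # 105.0

for v in (98.0, 103.0, 110.0):
    if v <= threshold:
        verdict = "strict pass"
    elif v <= slack_limit:
        verdict = "pass within slack"
    else:
        verdict = "fail"
    print(v, verdict)
```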
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
     - max 31 chars
     - cannot contain: : \ / ? * [ ]
     - cannot be empty
+
+    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
+    module's compile overhead/edge-cases on some systems.
     """
     name = "sheet" if name is None else str(name)
-    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+
+    # Replace illegal characters with underscore.
+    trans = str.maketrans(
+        {
+            ":": "_",
+            "\\": "_",
+            "/": "_",
+            "?": "_",
+            "*": "_",
+            "[": "_",
+            "]": "_",
+        }
+    )
+    name = name.translate(trans)
+
+    # Strip quotes/spaces and collapse whitespace.
     name = name.strip().strip("'")
-    name = re.sub(r"\s+", " ", name)
+    name = " ".join(name.split())

     if not name:
         name = "sheet"
     return name[:31]
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:

 def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
     d = dict(zip(group_cols, gkey_tuple))
-    model = d.get("Model", "model")
-    model_short = str(model).split("/")[-1]
+    # Always keep input/output lengths (these are important).
     ilen = d.get("Input Len", "")
     olen = d.get("Output Len", "")
     lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
+
+    # Shorten model name aggressively to make room for lens.
+    model = d.get("Model", "model")
+    leaf = str(model).split("/")[-1]
+
+    max_model_len = max(1, 31 - len(lens))
+    model_short = leaf[:max_model_len]
+
     return _sanitize_sheet_name(f"{model_short}{lens}")


 def _write_tables_to_excel_sheet(
     writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
 ):
-    startrow = 0
-    for title, df in blocks:
-        pd.DataFrame([[title]]).to_excel(
-            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
-        )
-        startrow += 1
-        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
-        startrow += len(df) + 3
+    """Write all blocks to a sheet with a single to_excel() call.
+
+    Pandas+openpyxl can be extremely slow when called many times per sheet.
+    We flatten blocks into one table with a 'Section' column to keep structure
+    while making Excel generation fast and deterministic.
+    """
+    if not blocks:
+        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
+        return
+
+    combined_parts: list[pd.DataFrame] = []
+    for title, df in blocks:
+        df2 = df.copy()
+        # Put the section label as the first column for readability.
+        df2.insert(0, "Section", title)
+        combined_parts.append(df2)
+
+    combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
+    combined.to_excel(writer, sheet_name=sheet, index=False)


 def _safe_filename(s: str) -> str:
-    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
-    return s[:180] if len(s) > 180 else s
+    # Fast path without the third-party `regex` module.
+    s = " ".join(str(s).strip().split())
+    allowed = []
+    for ch in s:
+        if ch.isalnum() or ch in "._-":
+            allowed.append(ch)
+        else:
+            allowed.append("_")
+    out = "".join(allowed)
+    return out[:180] if len(out) > 180 else out


 # -----------------------------
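The regex-free sanitization above leans on `str.maketrans`/`str.translate` plus `split`/`join` for whitespace. A quick standalone sketch of the same two steps (the sample string is made up):

```python
# Build one translation table for the illegal Excel sheet-name characters,
# then collapse runs of whitespace without any regex engine.
trans = str.maketrans({c: "_" for c in ":\\/?*[]"})

name = "results: run/1 [fp8]  *final*"
name = " ".join(name.translate(trans).split())
print(name)  # results_ run_1 _fp8_ _final_
```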
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:


 def _max_concurrency_ok(
-    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+    df: pd.DataFrame,
+    conc_col: str,
+    cfg_col: str,
+    threshold: float,
+    slack_pct: float = 0.0,
 ):
     if df is None or conc_col not in df.columns or cfg_col not in df.columns:
         return pd.NA
@@ -441,7 +573,14 @@
     if d.empty:
         return pd.NA

-    ok = d[d[cfg_col] <= threshold]
+    # Accept values up to (1 + slack_pct%) above the SLA.
+    try:
+        slack_pct = float(slack_pct or 0.0)
+    except Exception:
+        slack_pct = 0.0
+    effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
+
+    ok = d[d[cfg_col] <= effective_limit]
     if ok.empty:
         return pd.NA

@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
     if not cfg_cols:
         cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

+    # Display SLA ranges in the table header (SLA .. SLA*(1+slack))
+    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
+    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
+    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
+    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
+
     rows = []
     for cfg in cfg_cols:
         ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
+            )
             if ttft_group_df is not None
             else pd.NA
         )
         tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
+            )
             if tpot_group_df is not None
             else pd.NA
         )
@@ -544,8 +693,8 @@
         rows.append(
             {
                 "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                 f"Max {conc_col} (Both)": both,
                 "Output Tput @ Both (tok/s)": tput_at_both,
                 "TTFT @ Both (ms)": ttft_at_both,
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
     if not cfg_cols:
         cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

+    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
+    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
+    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
+    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
+
     rows = []
     for cfg in cfg_cols:
         ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
+            )
             if ttft_group_df is not None
             else pd.NA
         )
         tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
+            )
             if tpot_group_df is not None
             else pd.NA
         )
@@ -657,8 +815,8 @@
         rows.append(
             {
                 "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                 f"Max {conc_col} (Both)": both,
                 "Output Tput @ Both (tok/s)": tput_at_both,
                 "TTFT @ Both (ms)": ttft_at_both,
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
         help="Reference limit for TPOT plots (ms)",
     )

-    # ---- NEW: export options ----
+    # ---- SLA tolerance (slack) options ----
+    parser.add_argument(
+        "--ttft-slack-pct",
+        type=float,
+        default=5.0,
+        help="Allowed percentage above TTFT SLA (default: 5).",
+    )
+    parser.add_argument(
+        "--tpot-slack-pct",
+        type=float,
+        default=5.0,
+        help="Allowed percentage above TPOT SLA (default: 5).",
+    )
+
+    # ---- export options ----
     parser.add_argument(
         "--excel-out",
         type=str,
@@ -843,9 +1015,13 @@ def render_metric_table_html(

     metric_name = metric_label.lower()
     if "ttft" in metric_name:
-        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+        styler = _highlight_threshold(
+            display_group, args.ttft_max_ms, args.ttft_slack_pct
+        )
     elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+        styler = _highlight_threshold(
+            display_group, args.tpot_max_ms, args.tpot_slack_pct
+        )
     else:
         styler = display_group.style

@@ -962,10 +1138,33 @@ def write_report_group_first(
     csv_dir.mkdir(parents=True, exist_ok=True)

     excel_path = args.excel_out or "perf_comparison.xlsx"
-    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+    disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
+
+    # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
+    excel_engine = (
+        os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
+    )
+    if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
+        excel_engine = "openpyxl"
+
+    excel_engine_kwargs = {}
+    if excel_engine == "xlsxwriter":
+        # Reduce memory pressure & usually faster writes.
+        excel_engine_kwargs = {"options": {"constant_memory": True}}
+
+    xw_ctx = (
+        nullcontext(None)
+        if disable_excel
+        else pd.ExcelWriter(
+            excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
+        )
+    )
+    with xw_ctx as xw:
+        used_sheets: set[str] = set()
         # ---- Environment sheet (first) ----
         env_sheet = _sanitize_sheet_name("Environment")
         env_df = _load_env_df_for_inputs(args, files)
+        if xw is not None:
             if env_df is None or env_df.empty:
                 pd.DataFrame(
                     [
@@ -978,6 +1177,7 @@ def write_report_group_first(
                 ).to_excel(xw, sheet_name=env_sheet, index=False)
             else:
                 env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+            used_sheets.add(env_sheet)
         with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
             main_fh.write('<meta charset="utf-8">\n')
             for gkey in group_keys:
@@ -993,12 +1193,19 @@

                 main_fh.write(group_header)

+                do_excel = xw is not None
                 sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
                 sheet_base = sheet
+                if do_excel:
                     dedup_i = 1
-                    while sheet in xw.sheets:
+                    while sheet in used_sheets:
                         dedup_i += 1
-                        sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+                        suffix = f"_{dedup_i}"
+                        # Ensure uniqueness even when sheet names are truncated.
+                        base = str(sheet_base)
+                        keep = max(1, 31 - len(suffix))
+                        sheet = _sanitize_sheet_name(base[:keep] + suffix)
+                    used_sheets.add(sheet)

                 excel_blocks: list[tuple[str, pd.DataFrame]] = []

@@ -1059,7 +1266,7 @@
                     )

                     excel_blocks.append(
-                        (metric_label, display_group.reset_index(drop=True))
+                        (metric_label, group_df.reset_index(drop=True))
                     )
                     if csv_dir:
                         fn = _safe_filename(
@@ -1067,7 +1274,7 @@
                                 "/", "_"
                             )
                         )
-                        display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
+                        group_df.to_csv(csv_dir / f"{fn}.csv", index=False)

                 summary_html = build_valid_max_concurrency_summary_html(
                     tput_group_df=tput_group_df,
@@ -1097,8 +1304,12 @@
                 )
                 summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)

+                if do_excel:
                     _write_tables_to_excel_sheet(xw, sheet, excel_blocks)

+    if disable_excel:
+        print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
+    else:
         print(f"Wrote Excel: {excel_path}")
     if csv_dir:
         print(f"Wrote CSVs under: {csv_dir}")
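The `xw_ctx` pattern above lets the same `with` block run whether or not Excel output is enabled: `contextlib.nullcontext(None)` stands in for `pd.ExcelWriter`, and `xw is None` gates every Excel-specific call. A minimal standalone sketch of the idea, using a plain file in place of the writer:

```python
from contextlib import nullcontext

disable = True  # e.g. VLLM_COMPARE_DISABLE_EXCEL=1

# nullcontext(None) yields None, so the with-block runs in both modes.
with (nullcontext(None) if disable else open("report.txt", "w")) as handle:
    if handle is not None:
        handle.write("only reached when output is enabled\n")
    print("shared post-processing runs in both modes")
```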
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh (365 changes; Executable file → Normal file)
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"

+# Adaptive search controls
+ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
+SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
+SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
+ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
+ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
+
 check_gpus() {
   if command -v nvidia-smi; then
     # check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
|
|||||||
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# -------------------------------
|
||||||
|
# Adaptive concurrency helpers
|
||||||
|
# -------------------------------
|
||||||
|
result_json_path_for_serving() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency=$3
|
||||||
|
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_metric_ms() {
|
||||||
|
local metric_name=$1
|
||||||
|
local json_file=$2
|
||||||
|
|
||||||
|
[[ -f "$json_file" ]] || return 0
|
||||||
|
|
||||||
|
if [[ "$metric_name" == "ttft" ]]; then
|
||||||
|
jq -r '
|
||||||
|
[
|
||||||
|
.ttft_ms.p99?,
|
||||||
|
.metrics.ttft_ms.p99?,
|
||||||
|
.ttft.p99?,
|
||||||
|
.metrics.ttft.p99?,
|
||||||
|
.p99_ttft_ms?,
|
||||||
|
.ttft_ms.mean?,
|
||||||
|
.metrics.ttft_ms.mean?,
|
||||||
|
.ttft.mean?,
|
||||||
|
.metrics.ttft.mean?,
|
||||||
|
.mean_ttft_ms?
|
||||||
|
] | map(select(. != null)) | .[0] // empty
|
||||||
|
' "$json_file"
|
||||||
|
else
|
||||||
|
jq -r '
|
||||||
|
[
|
||||||
|
.tpot_ms.p99?,
|
||||||
|
.metrics.tpot_ms.p99?,
|
||||||
|
.tpot.p99?,
|
||||||
|
.metrics.tpot.p99?,
|
||||||
|
.p99_tpot_ms?,
|
||||||
|
.itl_ms.p99?,
|
||||||
|
.metrics.itl_ms.p99?,
|
||||||
|
.inter_token_latency_ms.p99?,
|
||||||
|
.tpot_ms.mean?,
|
||||||
|
.metrics.tpot_ms.mean?,
|
||||||
|
.tpot.mean?,
|
||||||
|
.metrics.tpot.mean?,
|
||||||
|
.itl_ms.mean?,
|
||||||
|
.metrics.itl_ms.mean?,
|
||||||
|
.mean_tpot_ms?,
|
||||||
|
.mean_itl_ms?
|
||||||
|
] | map(select(. != null)) | .[0] // empty
|
||||||
|
' "$json_file"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
evaluate_sla_from_json() {
|
||||||
|
local json_file=$1
|
||||||
|
local ttft
|
||||||
|
local tpot
|
||||||
|
local pass
|
||||||
|
|
||||||
|
[[ -f "$json_file" ]] || return 2
|
||||||
|
|
||||||
|
ttft=$(extract_metric_ms ttft "$json_file")
|
||||||
|
tpot=$(extract_metric_ms tpot "$json_file")
|
||||||
|
|
||||||
|
[[ -n "$ttft" && -n "$tpot" ]] || return 2
|
||||||
|
|
||||||
|
pass=$(jq -n \
|
||||||
|
--argjson ttft "$ttft" \
|
||||||
|
--argjson tpot "$tpot" \
|
||||||
|
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||||
|
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||||
|
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
|
||||||
|
|
||||||
|
[[ "$pass" == "true" ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
write_adaptive_summary_json() {
|
||||||
|
local summary_file=$1
|
||||||
|
local test_name=$2
|
||||||
|
local qps=$3
|
||||||
|
local static_last_pass=$4
|
||||||
|
local static_first_fail=$5
|
||||||
|
local final_last_pass=$6
|
||||||
|
local final_first_fail=$7
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg test_name "$test_name" \
|
||||||
|
--arg qps "$qps" \
|
||||||
|
--argjson sla_ttft "$SLA_TTFT_MS" \
|
||||||
|
--argjson sla_tpot "$SLA_TPOT_MS" \
|
||||||
|
--arg static_last_pass "${static_last_pass:-}" \
|
||||||
|
--arg static_first_fail "${static_first_fail:-}" \
|
||||||
|
--arg final_last_pass "${final_last_pass:-}" \
|
||||||
|
--arg final_first_fail "${final_first_fail:-}" \
|
||||||
|
'{
|
||||||
|
test_name: $test_name,
|
||||||
|
qps: $qps,
|
||||||
|
sla_ttft_ms: $sla_ttft,
|
||||||
|
sla_tpot_ms: $sla_tpot,
|
||||||
|
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
|
||||||
|
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
|
||||||
|
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
|
||||||
|
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
|
||||||
|
}' > "$summary_file"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_single_serving_probe() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency=$3
|
||||||
|
local tp=$4
|
||||||
|
local compilation_config_mode=$5
|
||||||
|
local optimization_level=$6
|
||||||
|
local client_args_effective=$7
|
||||||
|
local client_remote_args=$8
|
||||||
|
local server_command=$9
|
||||||
|
|
||||||
|
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
|
||||||
|
local result_json
|
||||||
|
local num_prompts_arg=""
|
||||||
|
local client_command
|
||||||
|
|
||||||
|
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
|
||||||
|
|
||||||
|
if [[ -f "$result_json" ]]; then
|
||||||
|
evaluate_sla_from_json "$result_json"
|
||||||
|
return $?
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
|
||||||
|
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
|
||||||
|
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
|
||||||
|
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
|
||||||
|
num_prompts_arg="--num-prompts $num_prompts"
|
||||||
|
fi
|
||||||
|
|
||||||
|
client_command="vllm bench serve \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
--max-concurrency $max_concurrency \
|
||||||
|
$num_prompts_arg \
|
||||||
|
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
|
||||||
|
$client_args_effective $client_remote_args "
|
||||||
|
|
||||||
|
echo "Adaptive probe: $client_command"
|
||||||
|
|
||||||
|
if [[ "${DRY_RUN:-0}" != "1" ]]; then
|
||||||
|
bash -c "$client_command"
|
||||||
|
fi
|
||||||
|
|
||||||
|
jq_output=$(jq -n \
|
||||||
|
--arg server "$server_command" \
|
||||||
|
--arg client "$client_command" \
|
||||||
|
--arg gpu "$gpu_type" \
|
||||||
|
'{
|
||||||
|
server_command: $server,
|
||||||
|
client_command: $client,
|
||||||
|
gpu_type: $gpu,
|
||||||
|
adaptive_search: true
|
||||||
|
}')
|
||||||
|
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
|
||||||
|
|
||||||
|
evaluate_sla_from_json "$result_json"
|
||||||
|
}
|
||||||
|
|
||||||
|
adaptive_refine_from_static_results() {
|
||||||
|
local test_name=$1
|
||||||
|
local qps=$2
|
||||||
|
local max_concurrency_list_raw=$3
|
||||||
|
local tp=$4
|
||||||
|
+    local compilation_config_mode=$5
+    local optimization_level=$6
+    local client_args_effective=$7
+    local client_remote_args=$8
+    local server_command=$9
+
+    local sorted_points
+    local point
+    local rc
+    local static_last_pass=""
+    local static_first_fail=""
+    local largest_static=""
+    local step_hint=1
+    local previous_point=""
+    local low
+    local high
+    local mid
+    local probes=0
+    local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
+
+    [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
+    [[ "${DRY_RUN:-0}" != "1" ]] || return 0
+
+    sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
+    [[ -n "$sorted_points" ]] || return 0
+
+    while read -r point; do
+        [[ -z "$point" ]] && continue
+        largest_static="$point"
+        evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
+        rc=$?
+        if (( rc == 0 )); then
+            static_last_pass="$point"
+        elif (( rc == 1 )); then
+            if [[ -n "$static_last_pass" ]]; then
+                static_first_fail="$point"
+                break
+            fi
+        fi
+
+        if [[ -n "$previous_point" ]]; then
+            step_hint=$(( point - previous_point ))
+            if (( step_hint < 1 )); then step_hint=1; fi
+        fi
+        previous_point="$point"
+    done <<< "$sorted_points"
+
+    if [[ -z "$static_last_pass" ]]; then
+        write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
+        return 0
+    fi
+
+    if [[ -n "$static_first_fail" ]]; then
+        low=$static_last_pass
+        high=$static_first_fail
+        while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+            mid=$(( (low + high) / 2 ))
+            probes=$(( probes + 1 ))
+            run_single_serving_probe \
+                "$test_name" "$qps" "$mid" "$tp" \
+                "$compilation_config_mode" "$optimization_level" \
+                "$client_args_effective" "$client_remote_args" "$server_command"
+            rc=$?
+            if (( rc == 0 )); then
+                low=$mid
+            elif (( rc == 1 )); then
+                high=$mid
+            else
+                break
+            fi
+        done
+        write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
+        return 0
+    fi
+
+    low=$largest_static
+    high=""
+    while (( probes < ADAPTIVE_MAX_PROBES )); do
+        point=$(( low + step_hint ))
+        if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
+            point=$ADAPTIVE_MAX_CONCURRENCY
+        fi
+        (( point > low )) || break
+        probes=$(( probes + 1 ))
+        run_single_serving_probe \
+            "$test_name" "$qps" "$point" "$tp" \
+            "$compilation_config_mode" "$optimization_level" \
+            "$client_args_effective" "$client_remote_args" "$server_command"
+        rc=$?
+        if (( rc == 0 )); then
+            low=$point
+            (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
+            step_hint=$(( step_hint * 2 ))
+            if (( step_hint < 1 )); then step_hint=1; fi
+        elif (( rc == 1 )); then
+            high=$point
+            break
+        else
+            break
+        fi
+    done
+
+    if [[ -n "$high" ]]; then
+        while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+            mid=$(( (low + high) / 2 ))
+            probes=$(( probes + 1 ))
+            run_single_serving_probe \
+                "$test_name" "$qps" "$mid" "$tp" \
+                "$compilation_config_mode" "$optimization_level" \
+                "$client_args_effective" "$client_remote_args" "$server_command"
+            rc=$?
+            if (( rc == 0 )); then
+                low=$mid
+            elif (( rc == 1 )); then
+                high=$mid
+            else
+                break
+            fi
+        done
+    fi
+
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
+}
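Review note: the refinement above is a doubling search that finds a failing bound, followed by bisection between the last pass and the first fail. A minimal standalone sketch of the same control flow, with the probe stubbed out (the `probe` function and its threshold are illustrative, not part of the script):

```bash
#!/bin/bash
# Stub: "meets SLA" iff concurrency <= 75 (hypothetical threshold).
probe() { (( $1 <= 75 )); }

low=32 step=16 high="" max=512 probes=0
# Expansion phase: grow the step while probes keep passing.
while (( probes < 20 )); do
    point=$(( low + step )); (( point > max )) && point=$max
    (( point > low )) || break
    probes=$(( probes + 1 ))
    if probe "$point"; then
        low=$point; (( point == max )) && break; step=$(( step * 2 ))
    else
        high=$point; break
    fi
done
# Bisection phase: tighten the pass/fail boundary to adjacent integers.
while [[ -n "$high" ]] && (( low + 1 < high )); do
    mid=$(( (low + high) / 2 )); probes=$(( probes + 1 ))
    if probe "$mid"; then low=$mid; else high=$mid; fi
done
echo "max passing concurrency: $low (probes used: $probes)"   # -> 75
```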
 run_benchmark_tests() {
     # run benchmark tests using `vllm bench <test_type>` command
     # $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
         server_envs=$(echo "$params" | jq -r '.server_environment_variables')
         client_params=$(echo "$params" | jq -r '.client_parameters')
 
-        server_args=$(json2args "$server_params")
+        # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
+        server_model=$(echo "$server_params" | jq -r '.model // empty')
+        if [[ -z "$server_model" || "$server_model" == "null" ]]; then
+            echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
+            exit 1
+        fi
+        server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
+        server_args=$(json2args "$server_params_no_model")
+
         server_envs=$(json2envs "$server_envs")
         client_args=$(json2args "$client_params")
+
+        # ------------------------------------------------------------
+        # Option 1: Dynamic num-prompts scaling based on max_concurrency
+        #
+        # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
+        #     num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
+        #
+        # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
+        # unchanged (i.e., whatever is in serving-tests-*.json).
+        # ------------------------------------------------------------
+        PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
+        MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
+        MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
+
+        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+            # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates).
+            # Handles: --num-prompts 123 and --num-prompts=123
+            client_args_no_np="$(
+                printf ' %s ' "$client_args" \
+                | sed -E \
+                    -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
+                    -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
+            )"
+            # normalize whitespace
+            client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
+            client_args_no_np="$(echo "$client_args_no_np" | xargs)"
+            client_args_effective="$client_args_no_np"
+        else
+            client_args_effective="$client_args"
+        fi
         # qps_list
         qps_list=$(echo "$params" | jq -r '.qps_list')
         qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
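The num-prompts scaling is driven entirely by environment variables. A hypothetical invocation (the script name is assumed; the arithmetic follows the Option 1 comment above):

```bash
# max_concurrency=64 with PROMPTS_PER_CONCURRENCY=8 gives
# num_prompts = 64 * 8 = 512, clamped to [MIN_NUM_PROMPTS, MAX_NUM_PROMPTS].
PROMPTS_PER_CONCURRENCY=8 MIN_NUM_PROMPTS=32 MAX_NUM_PROMPTS=4096 \
    bash run-performance-benchmarks.sh
# Leave PROMPTS_PER_CONCURRENCY unset to keep the JSON's num_prompts as-is.
```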
@@ -382,14 +725,13 @@ run_serving_tests() {
         fi
 
         # check if server model and client model is aligned
-        server_model=$(echo "$server_params" | jq -r '.model')
         client_model=$(echo "$client_params" | jq -r '.model')
         if [[ $server_model != "$client_model" ]]; then
             echo "Server model and client model must be the same. Skip testcase $test_name."
             continue
         fi
 
-        server_command="$server_envs vllm serve \
+        server_command="$server_envs vllm serve $server_model \
             $server_args"
 
         # run the server
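With the model hoisted out of `server_args`, the assembled `server_command` takes a shape like the following (values illustrative, drawn from the test configs in this change):

```bash
VLLM_RPC_TIMEOUT=100000 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
    --tensor-parallel-size 1 --max-model-len 2048 --load-format dummy
```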
@@ -436,6 +778,14 @@ run_serving_tests() {
             for max_concurrency in $max_concurrency_list; do
                 new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
                 echo " new test name $new_test_name"
+                # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
+                num_prompts_arg=""
+                if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+                    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+                    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+                    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+                    num_prompts_arg="--num-prompts $num_prompts"
+                fi
                 # pass the tensor parallel size, the compilation mode, and the optimization
                 # level to the client so that they can be used on the benchmark dashboard
                 client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
                     --result-filename ${new_test_name}.json \
                     --request-rate $qps \
                     --max-concurrency $max_concurrency \
+                    $num_prompts_arg \
                     --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-                    $client_args $client_remote_args "
+                    $client_args_effective $client_remote_args "
 
                 echo "Running test case $test_name with qps $qps"
                 echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
                 echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
             done
+
+            adaptive_refine_from_static_results \
+                "$test_name" "$qps" "$max_concurrency_list" "$tp" \
+                "$compilation_config_mode" "$optimization_level" \
+                "$client_args_effective" "$client_remote_args" "$server_command"
         done
 
         # clean up
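For one concrete case: at qps 4 and max_concurrency 64 with PROMPTS_PER_CONCURRENCY=8, the client command would expand to roughly the following (test name, metadata values, and dataset flags are illustrative):

```bash
vllm bench serve \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --result-filename serving_llama8B_tp1_random_128_128_qps_4_concurrency_64.json \
    --request-rate 4 \
    --max-concurrency 64 \
    --num-prompts 512 \
    --metadata tensor_parallel_size=1 compilation_config.mode=NONE optimization_level=0 \
    --dataset-name random --random-input-len 128 --random-output-len 128
```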
@@ -532,6 +888,7 @@ main() {
     # postprocess benchmarking results
     pip install tabulate pandas
     python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+    python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
 
     upload_to_buildkite
 }

@@ -51,5 +51,56 @@
             "max-model-len": 256,
             "async-scheduling": ""
         }
+    },
+    {
+        "test_name": "latency_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "dtype": "bfloat16"
+        }
+    },
+    {
+        "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "max-model-len": 512,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "gpu-memory-utilization": 0.95,
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "latency_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "dtype": "bfloat16",
+            "async-scheduling": ""
+        }
     }
 ]
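Each entry presumably maps one-to-one onto `vllm bench latency` flags via json2args, with `environment_variables` exported first; the qwen3 entry would drive something like (illustrative):

```bash
PT_HPU_LAZY_MODE=1 PT_HPU_ENABLE_LAZY_COLLECTIVES=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
vllm bench latency --model Qwen/Qwen3-8B --tensor-parallel-size 1 \
    --max-model-len 2048 --max-num-seqs 128 --dtype bfloat16
```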

@@ -0,0 +1,37 @@
+{
+    "defaults": {
+        "qps_list": [
+            "inf"
+        ],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
+        },
+        "server_parameters": {
+            "dtype": "bfloat16",
+            "model": "openai/whisper-large-v3-turbo"
+        },
+        "client_parameters": {
+            "model": "openai/whisper-large-v3-turbo",
+            "backend": "openai-audio",
+            "endpoint": "/v1/audio/transcriptions",
+            "dataset_name": "hf",
+            "dataset_path": "openslr/librispeech_asr",
+            "hf_subset": "clean",
+            "hf_split": "test",
+            "no_stream": "",
+            "no_oversample": "",
+            "num_prompts": 200
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
+            "server_parameters": {
+                "tensor_parallel_size": 1
+            },
+            "client_parameters": {}
+        }
+    ]
+}
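This file factors shared settings into a `defaults` block; each test entry presumably overlays it with its own keys. The effective merge for the tp1 test can be illustrated with jq's recursive-merge operator:

```bash
# jq's '*' deep-merges objects; the effective server_parameters would be:
jq -n '{"dtype": "bfloat16", "model": "openai/whisper-large-v3-turbo"}
       * {"tensor_parallel_size": 1}'
# => {"dtype":"bfloat16","model":"openai/whisper-large-v3-turbo","tensor_parallel_size":1}
```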
@@ -149,6 +149,39 @@
             "random-output-len": 128
         }
     },
+    {
+        "test_name": "serving_llama8B_tp1_random_2048_2048",
+        "server_parameters": {
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 2048
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_2048_2048",
+        "server_parameters": {
+            "tensor_parallel_size": 2
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 2048
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp4_random_2048_2048",
+        "server_parameters": {
+            "tensor_parallel_size": 4
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 2048
+        }
+    },
     {
         "test_name": "serving_llama8B_int4_tp1_random_128_128",
         "server_parameters": {
@@ -188,6 +221,45 @@
             "random-output-len": 128
         }
     },
+    {
+        "test_name": "serving_llama8B_int8_tp1_random_128_128",
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128
+        }
+    },
     {
         "test_name": "serving_llama3B_tp1_random_128_128",
         "server_parameters": {
@@ -72,17 +72,6 @@
             "random-output-len": 128
         }
     },
-    {
-        "test_name": "serving_llama8B_tp4_random_128_128",
-        "server_parameters": {
-            "tensor_parallel_size": 4
-        },
-        "client_parameters": {
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 128
-        }
-    },
     {
         "test_name": "serving_llama8B_tp1_random_128_2048",
         "server_parameters": {
@@ -105,17 +94,6 @@
             "random-output-len": 2048
         }
     },
-    {
-        "test_name": "serving_llama8B_tp4_random_128_2048",
-        "server_parameters": {
-            "tensor_parallel_size": 4
-        },
-        "client_parameters": {
-            "dataset_name": "random",
-            "random-input-len": 128,
-            "random-output-len": 2048
-        }
-    },
     {
         "test_name": "serving_llama8B_tp1_random_2048_128",
         "server_parameters": {
@@ -139,14 +117,25 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_random_2048_128",
+        "test_name": "serving_llama8B_tp1_random_2048_2048",
         "server_parameters": {
-            "tensor_parallel_size": 4
+            "tensor_parallel_size": 1
         },
         "client_parameters": {
             "dataset_name": "random",
             "random-input-len": 2048,
-            "random-output-len": 128
+            "random-output-len": 2048
+        }
+    },
+    {
+        "test_name": "serving_llama8B_tp2_random_2048_2048",
+        "server_parameters": {
+            "tensor_parallel_size": 2
+        },
+        "client_parameters": {
+            "dataset_name": "random",
+            "random-input-len": 2048,
+            "random-output-len": 2048
         }
     }
 ]
@@ -10,7 +10,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -37,7 +36,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -64,7 +62,6 @@
         "server_parameters": {
             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "tensor_parallel_size": 2,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy",
             "max-model-len": 2048,
@@ -78,5 +75,83 @@
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
+    },
+    {
+        "test_name": "serving_deepseek_r1",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 200,
+            "async-scheduling": "",
+            "dtype": "bfloat16"
+        },
+        "client_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "enable_expert_parallel": "",
+            "max-num-batched-tokens": 4096
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_qwen3_8b",
+        "qps_list": [1, 4, 10, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "disable_log_stats": "",
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
     }
 ]
@@ -5,7 +5,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -23,7 +22,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -41,7 +39,6 @@
         "server_parameters": {
             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "tensor_parallel_size": 2,
-            "swap_space": 16,
             "disable_log_stats": "",
             "load_format": "dummy"
         },
@@ -59,7 +56,6 @@
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
-            "swap_space": 16,
             "speculative_config": {
                 "model": "turboderp/Qwama-0.5B-Instruct",
                 "num_speculative_tokens": 4,
@@ -57,5 +57,67 @@
             "max-num-seqs": 512,
             "async-scheduling": ""
         }
+    },
+    {
+        "test_name": "throughput_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 384,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": "",
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "max-num-seqs": 512,
+            "backend": "vllm",
+            "async-scheduling": ""
+        }
     }
 ]
@@ -83,7 +83,7 @@ steps:
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
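The three per-ISA build args collapse into a single `VLLM_CPU_X86=true` switch. A local equivalent of the updated build (image tag assumed):

```bash
DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=16 --build-arg VLLM_CPU_X86=true \
    --tag vllm-cpu:dev --target vllm-openai \
    -f docker/Dockerfile.cpu .
```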
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
 ```
 
@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
 - **torchvision**: TorchVision for ROCm PyTorch
 - **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
-- **aiter**: Aiter for ROCm
+- **amd_aiter**: Aiter for ROCm
 - **flash-attn**: Flash Attention for ROCm
 
 ### :warning: Notes
.buildkite/scripts/check-ray-compatibility.sh (new file, 213 lines)
@@ -0,0 +1,213 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Check if Ray LLM can generate lock files that are compatible with this
+# version of vllm. Downloads Ray's requirement files and runs a full
+# dependency resolution with the installed vllm's constraints to see if
+# a valid lock file can be produced.
+#
+# See: https://github.com/vllm-project/vllm/issues/33599
+
+set -eo pipefail
+
+RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
+
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+# Fetch all Ray requirement files used in the LLM depset pipeline
+echo ">>> Fetching Ray requirement files"
+RAY_FILES=(
+    "requirements.txt"
+    "requirements/cloud-requirements.txt"
+    "requirements/base-test-requirements.txt"
+    "requirements/llm/llm-requirements.txt"
+    "requirements/llm/llm-test-requirements.txt"
+)
+for FILE in "${RAY_FILES[@]}"; do
+    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
+    echo "  ${FILE}"
+    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
+done
+
+# Extract installed vllm deps
+echo ">>> Extracting installed vllm dependency constraints"
+python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
+"""Write out the installed vllm's dependencies as pip constraint lines.
+
+Ray uses vllm[audio], so audio-extra deps are included with their extra
+markers stripped. The resolver cannot evaluate extra markers for a
+package that is not itself being resolved from an index, so we activate
+them manually here.
+"""
+import importlib.metadata
+import re
+import sys
+
+out_path = sys.argv[1]
+raw_reqs = importlib.metadata.requires("vllm") or []
+
+# Ray uses vllm[audio] -- activate that extra.
+ACTIVE_EXTRAS = {"audio"}
+EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
+
+lines = []
+for r in raw_reqs:
+    if ";" not in r:
+        # Unconditional dep -- always include.
+        lines.append(r.strip())
+        continue
+
+    req_part, _, marker_part = r.partition(";")
+    marker_part = marker_part.strip()
+
+    extra_matches = EXTRA_RE.findall(marker_part)
+    if not extra_matches:
+        # Non-extra marker (python_version, etc.) -- keep as-is.
+        lines.append(r.strip())
+        continue
+
+    if not ACTIVE_EXTRAS.intersection(extra_matches):
+        continue  # Skip inactive extras (tensorizer, bench, ...).
+
+    # Strip the extra== conditions but keep any remaining markers
+    # (e.g. python_version).
+    cleaned = EXTRA_RE.sub("", marker_part)
+    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
+    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
+
+    if cleaned:
+        lines.append(f"{req_part.strip()} ; {cleaned}")
+    else:
+        lines.append(req_part.strip())
+
+with open(out_path, "w") as f:
+    for line in lines:
+        f.write(line + "\n")
+
+print(f"Wrote {len(lines)} constraints to {out_path}")
+PYEOF
+
+echo ">>> Installed vllm deps (first 20 lines):"
+head -20 "${WORK_DIR}/vllm-constraints.txt"
+
+# Remove Ray's vllm pin -- the installed vllm's transitive deps
+# (written above) replace it in the resolution. vllm itself cannot
+# be resolved from PyPI for in-development versions, so we test
+# whether Ray's requirements can coexist with vllm's dependency
+# constraints instead.
+sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
+
+# Install uv if needed
+if ! command -v uv &>/dev/null; then
+    echo ">>> Installing uv"
+    pip install uv -q
+fi
+
+# Resolve: given vllm's constraints, can Ray compile a lock file?
+#
+# vllm's dependency constraints are the fixed side -- Ray is flexible and
+# can regenerate its lock files. We pass vllm's constraints via -c so
+# the resolver treats them as non-negotiable bounds, then check whether
+# Ray's own requirements can still be satisfied within those bounds.
+echo ""
+echo "============================================================"
+echo ">>> Resolving: Can Ray generate compatible lock files?"
+echo "============================================================"
+
+set +e
+uv pip compile \
+    "${WORK_DIR}/requirements.txt" \
+    "${WORK_DIR}/cloud-requirements.txt" \
+    "${WORK_DIR}/base-test-requirements.txt" \
+    "${WORK_DIR}/llm-requirements.txt" \
+    "${WORK_DIR}/llm-test-requirements.txt" \
+    -c "${WORK_DIR}/vllm-constraints.txt" \
+    --python-version 3.12 \
+    --python-platform x86_64-manylinux_2_31 \
+    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    --index-strategy unsafe-best-match \
+    --unsafe-package setuptools \
+    --unsafe-package ray \
+    --no-header \
+    -o "${WORK_DIR}/resolved.txt" \
+    2>&1
+EXIT_CODE=$?
+set -e
+
+echo ""
+echo "=========================================="
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
+    echo ""
+    echo "Key resolved versions:"
+    grep -E '^(protobuf|torch|numpy|transformers)==' \
+        "${WORK_DIR}/resolved.txt" | sort || true
+    echo "=========================================="
+    exit 0
+fi
+
+echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
+echo "This means a fundamental dependency conflict exists that Ray"
+echo "cannot resolve by regenerating its lock files."
+echo "See: https://github.com/vllm-project/vllm/issues/33599"
+echo "=========================================="
+
+# Buildkite annotation
+if [ -f /usr/bin/buildkite-agent ]; then
+    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
+### :warning: Ray Dependency Compatibility Warning
+This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
+Ray would not be able to regenerate its lock files to accommodate this vllm version.
+
+Please check the **Ray Dependency Compatibility Check** step logs for details.
+See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
+EOF
+fi
+
+# Notify Slack if webhook is configured and PR/branch are valid.
+if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
+    PR="${BUILDKITE_PULL_REQUEST:-}"
+    BRANCH="${BUILDKITE_BRANCH:-}"
+
+    # Skip notification if PR is invalid or branch is empty
+    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
+        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
+    else
+        echo ">>> Sending Slack notification"
+        # Single quotes are intentional: the f-string expressions are Python, not shell.
+        # shellcheck disable=SC2016
+        PAYLOAD=$(python3 -c '
+import json, os, sys
+pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
+branch = os.getenv("BUILDKITE_BRANCH", "unknown")
+url = os.getenv("BUILDKITE_BUILD_URL", "#")
+data = {
+    "text": ":warning: Ray Dependency Compatibility Check Failed",
+    "blocks": [{
+        "type": "section",
+        "text": {
+            "type": "mrkdwn",
+            "text": (
+                "*:warning: Ray Dependency Compatibility Check Failed*\n"
+                f"PR #{pr} on branch `{branch}` introduces dependencies "
+                f"that cannot be resolved with Ray'\''s requirements.\n"
+                f"<{url}|View Build>"
+            ),
+        },
+    }],
+}
+print(json.dumps(data))
+')
+
+        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
+            -H 'Content-type: application/json' \
+            -d "$PAYLOAD")
+        echo "  Slack webhook response: $HTTP_CODE"
+    fi
+else
+    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
+fi
+
+exit 1
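The load-bearing piece is `uv pip compile ... -c`: a constraints file bounds the resolution without being installed itself. A minimal standalone illustration (package names arbitrary; assumes `uv` is on PATH):

```bash
printf 'requests\n' > reqs.txt
printf 'urllib3<2\n' > constraints.txt
uv pip compile reqs.txt -c constraints.txt -o resolved.txt
# resolved.txt pins urllib3 to a 1.x release even though requests alone
# allows 2.x -- the same way vllm's dependency bounds cap Ray's resolution.
grep urllib3 resolved.txt
```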
@@ -6,6 +6,26 @@
 # Multi-node detection: Instead of matching on fragile group names, we detect
 # multi-node jobs structurally by looking for the bracket command syntax
 # "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
+#
+###############################################################################
+# QUOTING / COMMAND PASSING
+#
+# Passing commands as positional arguments ($*) is fragile when the command
+# string itself contains double quotes, e.g.:
+#
+#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
+#
+# The outer shell resolves the nested quotes *before* this script runs, so
+# the script receives mangled input it cannot fully recover.
+#
+# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
+#
+#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
+#   bash run-amd-test.sh
+#
+# Single-quoted assignment preserves all inner double quotes verbatim.
+# The $* path is kept for backward compatibility but callers should migrate.
+###############################################################################
 set -o pipefail
 
 # Export Python path
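The failure mode the new header describes is easy to reproduce: the caller's shell consumes the nested double quotes before this script ever runs. A small demonstration (the `printargs` helper is illustrative):

```bash
printargs() { printf '<%s>\n' "$@"; }
printargs "export FLAGS="value" && pytest -m "not slow""
# <export FLAGS=value && pytest -m not>
# <slow>
# The inner quotes are gone and the marker expression is split across
# arguments, which is why a single-quoted VLLM_TEST_COMMANDS is preferred.
```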
@@ -79,26 +99,169 @@ is_multi_node() {
     return 1
 }
 
+handle_pytest_exit() {
+    local exit_code=$1
+    if [ "$exit_code" -eq 5 ]; then
+        echo "Pytest exit code 5 (no tests collected) - treating as success."
+        exit 0
+    fi
+    exit "$exit_code"
+}
+
 ###############################################################################
-# Pytest marker re-quoting
+# Pytest marker/keyword re-quoting
 #
 # When commands are passed through Buildkite -> shell -> $* -> bash -c,
-# quotes around pytest -m marker expressions get stripped:
+# quotes around multi-word pytest -m/-k expressions get stripped:
 #   pytest -v -s -m 'not cpu_test' v1/core
 # becomes:
 #   pytest -v -s -m not cpu_test v1/core
 #
 # pytest then interprets "cpu_test" as a file path, not part of the marker.
-# This function detects unquoted multi-word marker expressions and re-quotes
-# them so they survive the final bash -c expansion.
+#
+# This function detects unquoted expressions after -m/-k and re-quotes them
+# by collecting tokens until a recognizable boundary is reached:
+#   - test path (contains '/')
+#   - test file (ends with '.py')
+#   - another pytest flag (--xxx or -x single-char flags)
+#   - command separator (&& || ; |)
+#   - environment variable assignment (FOO=bar)
+#
+# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
+# unquoted since they have no spaces and work fine.
+#
+# Already-quoted expressions (containing literal single quotes) are passed
+# through untouched to avoid double-quoting values injected by
+# apply_rocm_test_overrides.
+#
+# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
+# double-quotes stripped by the calling shell (see header comment).
+# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
 ###############################################################################
 
 re_quote_pytest_markers() {
-    local cmds="$1"
-    # Pattern: -m not <identifier> -> -m 'not <identifier>'
-    # Handles the common cases: 'not cpu_test', 'not slow_test', etc.
-    cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
-    echo "$cmds"
+    local input="$1"
+    local output=""
+    local collecting=false
+    local marker_buf=""
+
+    # Strip backslash-newline continuations, then flatten remaining newlines
+    local flat="${input//$'\\\n'/ }"
+    flat="${flat//$'\n'/ }"
+
+    # Disable globbing to prevent *.py etc. from expanding during read -ra
+    local restore_glob
+    restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
+    set -o noglob
+    local -a words
+    read -ra words <<< "$flat"
+    eval "$restore_glob"
+
+    for word in "${words[@]}"; do
+        if $collecting; then
+            # If the token we're about to collect already contains a literal
+            # single quote, the expression was already quoted upstream.
+            # Flush and stop collecting.
+            if [[ "$word" == *"'"* ]]; then
+                if [[ -n "$marker_buf" ]]; then
+                    # Should not normally happen (partial buf + quote), flush raw
+                    output+="${marker_buf} "
+                    marker_buf=""
+                fi
+                output+="${word} "
+                collecting=false
+                continue
+            fi
+
+            local is_boundary=false
+            case "$word" in
+                # Line-continuation artifact
+                "\\")
+                    is_boundary=true ;;
+                # Command separators
+                "&&"|"||"|";"|"|")
+                    is_boundary=true ;;
+                # Long flags (--ignore, --shard-id, etc.)
+                --*)
+                    is_boundary=true ;;
+                # Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
+                # like "not" which don't start with "-". Also skip -k/-m which
+                # would start a new marker (handled below).
+                -[a-zA-Z])
+                    is_boundary=true ;;
+                # Test path (contains /)
+                */*)
+                    is_boundary=true ;;
+                # Test file (ends with .py, possibly with ::method)
+                *.py|*.py::*)
+                    is_boundary=true ;;
+                # Environment variable assignment preceding a command (FOO=bar)
+                *=*)
+                    # Only treat as boundary if it looks like VAR=value, not
+                    # pytest filter expressions like num_gpus=2 inside markers
+                    if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
+                        is_boundary=true
+                    fi
+                    ;;
+            esac
+
+            if $is_boundary; then
+                # Strip surrounding double quotes if present (from upstream
+                # single-to-double conversion); without this, wrapping below
+                # would produce '"expr"' with literal double-quote characters.
+                if [[ "$marker_buf" == '"'*'"' ]]; then
+                    marker_buf="${marker_buf#\"}"
+                    marker_buf="${marker_buf%\"}"
+                fi
+                # Flush the collected marker expression
+                if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+                    output+="'${marker_buf}' "
+                else
+                    output+="${marker_buf} "
+                fi
+                collecting=false
+                marker_buf=""
+                # Check if this boundary word itself starts a new -m/-k
+                if [[ "$word" == "-m" || "$word" == "-k" ]]; then
+                    output+="${word} "
+                    collecting=true
+                # Drop stray backslash tokens silently
+                elif [[ "$word" == "\\" ]]; then
+                    :
+                else
+                    output+="${word} "
+                fi
+            else
+                # Accumulate into marker buffer
+                if [[ -n "$marker_buf" ]]; then
+                    marker_buf+=" ${word}"
+                else
+                    marker_buf="${word}"
+                fi
+            fi
+        elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
+            output+="${word} "
+            collecting=true
+            marker_buf=""
+        else
+            output+="${word} "
+        fi
+    done
+
+    # Flush any trailing marker expression (marker at end of command)
+    if $collecting && [[ -n "$marker_buf" ]]; then
+        # Strip surrounding double quotes (see mid-stream flush comment)
+        if [[ "$marker_buf" == '"'*'"' ]]; then
+            marker_buf="${marker_buf#\"}"
+            marker_buf="${marker_buf%\"}"
+        fi
+        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+            output+="'${marker_buf}'"
+        else
+            output+="${marker_buf}"
+        fi
+    fi
+
+    echo "${output% }"
 }
 
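Expected behavior of the rewritten function, assuming it is sourced into a shell (inputs mirror the header's examples):

```bash
re_quote_pytest_markers "pytest -v -s -m not cpu_test v1/core"
# -> pytest -v -s -m 'not cpu_test' v1/core

re_quote_pytest_markers "pytest -k not slow and not dist tests/test_x.py"
# -> pytest -k 'not slow and not dist' tests/test_x.py

re_quote_pytest_markers "pytest -m cpu_test tests/"
# -> pytest -m cpu_test tests/    (single-word markers pass through)
```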
 ###############################################################################
 
@@ -170,15 +333,15 @@ apply_rocm_test_overrides() {
     # --- Entrypoint ignores ---
     if [[ $cmds == *" entrypoints/openai "* ]]; then
         cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
-            --ignore=entrypoints/openai/test_audio.py \
+            --ignore=entrypoints/openai/chat_completion/test_audio.py \
-            --ignore=entrypoints/openai/test_shutdown.py \
+            --ignore=entrypoints/openai/completion/test_shutdown.py \
             --ignore=entrypoints/openai/test_completion.py \
             --ignore=entrypoints/openai/test_models.py \
             --ignore=entrypoints/openai/test_lora_adapters.py \
             --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-            --ignore=entrypoints/openai/test_root_path.py \
+            --ignore=entrypoints/openai/chat_completion/test_root_path.py \
             --ignore=entrypoints/openai/test_tokenization.py \
-            --ignore=entrypoints/openai/test_prompt_validation.py "}
+            --ignore=entrypoints/openai/completion/test_prompt_validation.py "}
     fi
 
     if [[ $cmds == *" entrypoints/llm "* ]]; then
@@ -231,11 +394,35 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
+# ---- Command source selection ----
+# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
+# Fall back to $* for backward compatibility, but warn that inner
+# double-quotes will have been stripped by the calling shell.
+if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
+    commands="${VLLM_TEST_COMMANDS}"
+    echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
+else
 commands="$*"
+    if [[ -z "$commands" ]]; then
+        echo "Error: No test commands provided." >&2
+        echo "Usage:" >&2
+        echo "  Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
+        echo "  Legacy:    bash $0 \"commands here\"" >&2
+        exit 1
+    fi
+    echo "Commands sourced from positional args (legacy mode)"
+    echo "WARNING: Inner double-quotes in the command string may have been"
+    echo "  stripped by the calling shell. If you see syntax errors, switch to:"
+    echo "    export VLLM_TEST_COMMANDS='your commands here'"
+    echo "    bash $0"
+fi
+
 echo "Raw commands: $commands"
 
 # Fix quoting before ROCm overrides (so overrides see correct structure)
 commands=$(re_quote_pytest_markers "$commands")
+echo "After re-quoting: $commands"
+
 commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"
 
@@ -248,6 +435,18 @@ if [[ -z "$render_gid" ]]; then
     exit 1
 fi
 
+# --- RDMA device passthrough (conditional) ---
+# If the host has RDMA devices, pass them through so tests like
+# test_moriio_connector can access ibverbs. On hosts without RDMA
+# hardware the tests will gracefully skip via _rdma_available().
+RDMA_FLAGS=""
+if [ -d /dev/infiniband ]; then
+    echo "RDMA devices detected on host, enabling passthrough"
+    RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
+else
+    echo "No RDMA devices found on host, RDMA tests will be skipped"
+fi
+
 # --- Route: multi-node vs single-node ---
 if is_multi_node "$commands"; then
     echo "--- Multi-node job detected"
@@ -282,7 +481,9 @@ if is_multi_node "$commands"; then
     done
 
     /bin/bash -c "${composite_command}"
+    exit_code=$?
     cleanup_network
+    handle_pytest_exit "$exit_code"
 else
     echo "Multi-node job detected but failed to parse bracket command syntax."
     echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
@@ -295,6 +496,7 @@ else
     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
     docker run \
         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        $RDMA_FLAGS \
         --network=host \
         --shm-size=16gb \
         --group-add "$render_gid" \
@@ -302,10 +504,15 @@ else
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
         -e AWS_SECRET_ACCESS_KEY \
+        -e BUILDKITE_PARALLEL_JOB \
+        -e BUILDKITE_PARALLEL_JOB_COUNT \
        -v "${HF_CACHE}:${HF_MOUNT}" \
         -e "HF_HOME=${HF_MOUNT}" \
         -e "PYTHONPATH=${MYPYTHONPATH}" \
         --name "${container_name}" \
         "${image_name}" \
         /bin/bash -c "${commands}"
+
+    exit_code=$?
+    handle_pytest_exit "$exit_code"
 fi
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
65
.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
Executable file
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+wait_for_pid_and_check_log() {
+    local pid="$1"
+    local log_file="$2"
+    local exit_status
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+
+    # Use the 'wait' command to pause the script until the specific PID exits.
+    # The 'wait' command's own exit status will be that of the waited-for process.
+    if wait "$pid"; then
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Success)."
+    else
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+    fi
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process exited with a non-zero status."
+        echo "--- Last few lines of log file: $log_file ---"
+        tail -n 50 "$log_file"
+        echo "---------------------------------------------"
+        return 1 # Indicate failure based on exit status
+    fi
+
+    echo "No errors detected in log file and process exited successfully."
+    return 0
+}
+
+# Test Sky Lake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+wait_for_pid_and_check_log $PID_TEST_0 test_0.log
+wait_for_pid_and_check_log $PID_TEST_1 test_1.log
+wait_for_pid_and_check_log $PID_TEST_2 test_2.log
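The three SDE runs above execute in parallel as background jobs, and `wait_for_pid_and_check_log` relies on the fact that `wait <pid>` returns the exit status of that job. A standalone sketch of the same pattern, with dummy jobs instead of SDE and vLLM:

```bash
#!/bin/bash
# Start two dummy background jobs, record their PIDs, then reap each one.
# `wait "$pid"` blocks until that job exits and returns its exit status.
(sleep 1; exit 0) &
PID_A=$!
(sleep 1; exit 3) &
PID_B=$!

for pid in "$PID_A" "$PID_B"; do
  if wait "$pid"; then
    echo "job $pid succeeded"
  else
    echo "job $pid failed with status $?"
  fi
done
```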
@@ -1,26 +1,43 @@
 #!/bin/bash
 set -euox pipefail
+export VLLM_CPU_CI_ENV=0
+
 echo "--- PP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
     --backend vllm \
     --dataset-name random \
     --model meta-llama/Llama-3.2-3B-Instruct \
     --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename tp_pp.json \
+    --save-result \
     --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/tp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+    echo "Some requests were failed!"
+    exit 1
+fi
+
 echo "--- DP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
     --backend vllm \
     --dataset-name random \
     --model meta-llama/Llama-3.2-3B-Instruct \
     --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename dp_pp.json \
+    --save-result \
     --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/dp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+    echo "Some requests were failed!"
+    exit 1
+fi
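Both serve/bench blocks now fail the job when `vllm bench serve --save-result` records any failed requests; `.failed` is read out of the result JSON with jq. A quick standalone check of that expression, with an illustrative result file:

```bash
#!/bin/bash
# Illustrative only: write a result file carrying the field the CI script
# reads, then apply the identical jq-based gate.
mkdir -p ./test_results
echo '{"completed": 20, "failed": 0}' > ./test_results/tp_pp.json

failed_req=$(jq '.failed' ./test_results/tp_pp.json)
if [ "$failed_req" -ne 0 ]; then
  echo "Some requests were failed!"
  exit 1
fi
echo "All requests completed."
```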
@@ -34,7 +34,7 @@ function cpu_tests() {
 # offline inference
 docker exec cpu-test bash -c "
     set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"

 # Run model tests
 docker exec cpu-test bash -c "
@@ -27,7 +27,7 @@ function cpu_tests() {
 podman exec -it "$container_id" bash -c "
     export TORCH_COMPILE_DISABLE=1
     set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log

 # Run basic model test
 podman exec -it "$container_id" bash -c "
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
@@ -1,9 +1,27 @@
 #!/bin/bash

-# This script build the CPU docker image and run the offline inference inside the container.
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
+#
+# vllm-gaudi compatibility pinning:
+# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
+# When upstream vllm changes its API, the plugin may break before it has been updated.
+# To handle this, the vllm-gaudi repository maintains a file:
+#   vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
+# The first line of that file controls what version of vllm is used inside the Docker image:
+#   - "latest"       : no checkout override; the current Buildkite CI commit is used as-is.
+#   - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
+#                      the test to a known-compatible baseline.
+# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

+# Fetch the vllm community commit reference from vllm-gaudi (first line only).
+VLLM_COMMUNITY_COMMIT=$(curl -s \
+    https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
+    | head -1 | tr -d '\n')
+
+echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
+
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
@@ -12,6 +30,13 @@ FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

+# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
+# to the version known to be compatible with vllm-gaudi. When the value is "latest",
+# the current checkout (the Buildkite CI commit) is used unchanged.
+RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
+        cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
+    fi
+
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
@@ -51,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
     -e PT_HPU_LAZY_MODE=1 \
     "${image_name}" \
     /bin/bash -c '
-        cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+        cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '

 EXITCODE=$?
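For debugging the pinning behaviour locally, the lookup the CI performs can be replayed by hand; this sketch reuses the exact URL from the script above and only adds the interpretation of the two possible values:

```bash
#!/bin/bash
# Replay the pin lookup from the HPU CI script (same URL, first line only).
pin=$(curl -s \
  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
  | head -1 | tr -d '\n')

if [ "$pin" = "latest" ]; then
  echo "Unpinned: the current vllm commit will be tested as-is."
else
  echo "Pinned to vllm commit: $pin"
fi
```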
@@ -34,17 +34,17 @@ docker run \
     set -e
     echo $ZE_AFFINITY_MASK
     pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
     cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
     pytest -v -s v1/engine
     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
     BACKENDS=("allgather_reducescatter")
     # Disable MOE padding for ROCm since it is causing eplb to fail
     export VLLM_ROCM_MOE_PADDING=0
-    PLATFORM_ARGS=("--no-async-scheduling")
+    PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
     echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
     # Non-ROCm platform (CUDA/other)

.buildkite/scripts/tool_call/run-bfcl-eval.sh (new executable file, 248 lines)
@@ -0,0 +1,248 @@
+#!/bin/bash
+# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
+# evaluation against a local vLLM server.
+#
+# Usage:
+#   # Run with defaults (gpt-oss-20b, multi_turn)
+#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+#   # Run with gpt-oss-120b and multiple test categories
+#   BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
+#   BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
+#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+#   # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
+#   BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
+#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
+#   BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
+#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+# Environment variables (all optional, with defaults):
+#   BFCL_MODEL            - HF model name (default: openai/gpt-oss-20b)
+#   BFCL_API_TYPE         - API type: "chat_completions" or "responses" (default: chat_completions)
+#   BFCL_OUTPUT_DIR       - Directory for BFCL results (default: current working directory)
+#   BFCL_TEST_CATEGORY    - BFCL test categories (default: multi_turn)
+#   BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
+#   BFCL_NUM_THREADS      - Threads for BFCL generate (default: 8)
+#   BFCL_TP_SIZE          - Tensor parallel size (default: 1)
+#   BFCL_MAX_MODEL_LEN    - Max model length (default: 4096)
+#   BFCL_PORT             - Server port (default: 8000)
+#   BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
+#   BFCL_EXTRA_ARGS       - Additional vLLM server args
+
+set -euo pipefail
+
+# ---- Configuration ----
+MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
+API_TYPE="${BFCL_API_TYPE:-chat_completions}"
+OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
+TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
+TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
+NUM_THREADS="${BFCL_NUM_THREADS:-8}"
+TP_SIZE="${BFCL_TP_SIZE:-1}"
+MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
+PORT="${BFCL_PORT:-8000}"
+REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
+EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
+
+# Set up output directory
+if [ -n "$OUTPUT_DIR" ]; then
+    mkdir -p "$OUTPUT_DIR"
+    OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
+fi
+
+echo "============================================"
+echo "BFCL Tool Call Correctness Evaluation"
+echo "============================================"
+echo "Model: $MODEL"
+echo "Tool parser: $TOOL_CALL_PARSER"
+echo "API type: $API_TYPE"
+echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
+echo "Test category: $TEST_CATEGORY"
+echo "TP size: $TP_SIZE"
+echo "Max model len: $MAX_MODEL_LEN"
+echo "Port: $PORT"
+echo "Num threads: $NUM_THREADS"
+echo "============================================"
+
+# ---- Install bfcl-eval if missing ----
+if ! python3 -c "import bfcl_eval" 2>/dev/null; then
+    echo "Installing bfcl-eval..."
+    pip install "bfcl-eval>=2025.10.20.1,<2026"
+fi
+
+# ---- Cleanup handler ----
+SERVER_PID=""
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Stopping vLLM server (pid=$SERVER_PID)..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    # Remove BFCL lock files (created by filelock for thread-safe writes)
+    rm -rf .file_locks/
+    if [ -n "${OUTPUT_DIR:-}" ]; then
+        rm -rf "$OUTPUT_DIR/.file_locks/"
+    fi
+}
+trap cleanup EXIT
+
+# ---- Start vLLM server ----
+echo "Starting vLLM server..."
+
+SERVE_ARGS=(
+    "$MODEL"
+    --port "$PORT"
+    --enable-auto-tool-choice
+    --tool-call-parser "$TOOL_CALL_PARSER"
+    --tensor-parallel-size "$TP_SIZE"
+    --max-model-len "$MAX_MODEL_LEN"
+    --enforce-eager
+    --no-enable-prefix-caching
+)
+
+# Append reasoning parser if specified
+if [ -n "$REASONING_PARSER" ]; then
+    SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
+fi
+
+# Append any extra args
+if [ -n "$EXTRA_ARGS" ]; then
+    read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
+    SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
+fi
+
+echo "Command: vllm serve ${SERVE_ARGS[*]}"
+vllm serve "${SERVE_ARGS[@]}" &
+SERVER_PID=$!
+
+# ---- Wait for server to be ready ----
+echo "Waiting for vLLM server to start (timeout: 600s)..."
+SECONDS_WAITED=0
+until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
+    if [ $SECONDS_WAITED -ge 600 ]; then
+        echo ""
+        echo "ERROR: vLLM server failed to start within 600s"
+        exit 1
+    fi
+    if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
+        echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
+    fi
+    sleep 2
+    SECONDS_WAITED=$((SECONDS_WAITED + 2))
+done
+echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
+
+# ---- Run BFCL evaluation ----
+# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
+# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
+# be patched in-process so BFCL knows to use the OpenAI-compatible handler
+# against our local vLLM server.
+bfcl_exit_code=0
+python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
+import os
+import sys
+
+model = sys.argv[1]
+test_category = sys.argv[2]
+num_threads = int(sys.argv[3])
+port = sys.argv[4]
+api_type = sys.argv[5]
+output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
+
+os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
+os.environ["OPENAI_API_KEY"] = "dummy"
+os.environ["BFCL_PROJECT_ROOT"] = output_dir
+
+import bfcl_eval.constants.model_config as bfcl_model_config
+from bfcl_eval.constants.model_config import ModelConfig
+from bfcl_eval.model_handler.api_inference.openai_completion import (
+    OpenAICompletionsHandler,
+)
+from bfcl_eval.model_handler.api_inference.openai_response import (
+    OpenAIResponsesHandler,
+)
+
+if api_type == "responses":
+    handler = OpenAIResponsesHandler
+else:
+    handler = OpenAICompletionsHandler
+
+bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
+    model_name=model,
+    display_name=f"{model} (FC) (vLLM)",
+    url=f"https://huggingface.co/{model}",
+    org="",
+    license="apache-2.0",
+    model_handler=handler,
+    input_price=None,
+    output_price=None,
+    is_fc_model=True,
+    underscore_to_dot=True,
+)
+
+from bfcl_eval.__main__ import evaluate, generate
+import inspect
+import typer
+
+
+def _get_default_kwargs(function):
+    kwargs = {}
+    for k, v in inspect.signature(function).parameters.items():
+        if v.default is not inspect.Parameter.empty:
+            default = v.default
+            if isinstance(default, typer.models.OptionInfo):
+                default = default.default
+            kwargs[k] = default
+    return kwargs
+
+
+# ---- generate ----
+print(f"=== BFCL generate: model={model} test_category={test_category} ===")
+gen_kwargs = _get_default_kwargs(generate)
+gen_kwargs["model"] = [model]
+gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
+gen_kwargs["skip_server_setup"] = True
+gen_kwargs["num_threads"] = num_threads
+generate(**gen_kwargs)
+
+# ---- evaluate ----
+print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
+eval_kwargs = _get_default_kwargs(evaluate)
+eval_kwargs["model"] = [model]
+eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
+evaluate(**eval_kwargs)
+
+print("=== BFCL evaluation completed successfully ===")
+PYEOF
+
+# ---- Upload results to buildkite ----
+if command -v buildkite-agent &>/dev/null; then
+    if [ $bfcl_exit_code -eq 0 ]; then
+        STYLE="success"
+        STATUS="PASSED"
+    else
+        STYLE="error"
+        STATUS="FAILED"
+    fi
+
+    buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
+### BFCL Tool Call Correctness - ${STATUS}
+- **Model:** \`${MODEL}\`
+- **Parser:** \`${TOOL_CALL_PARSER}\`
+- **API type:** \`${API_TYPE}\`
+- **Test category:** \`${TEST_CATEGORY}\`
+EOF
+
+    # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
+    # $BFCL_PROJECT_ROOT/score/
+    RESULTS_ROOT="${OUTPUT_DIR:-.}"
+    if [ -d "$RESULTS_ROOT/result" ]; then
+        buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
+    fi
+    if [ -d "$RESULTS_ROOT/score" ]; then
+        buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
+    fi
+fi
+
+exit $bfcl_exit_code
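A local smoke run only needs the environment variables documented in the header; for example (assuming `simple` is an available BFCL test category; category names are defined by bfcl-eval, not by this script):

```bash
#!/bin/bash
# Hypothetical local smoke run; all variables are the documented BFCL_* knobs.
BFCL_MODEL="openai/gpt-oss-20b" \
BFCL_TEST_CATEGORY="simple" \
BFCL_NUM_THREADS=4 \
BFCL_OUTPUT_DIR=./bfcl-smoke \
bash .buildkite/scripts/tool_call/run-bfcl-eval.sh

# Per the script, generations land under ./bfcl-smoke/result/ and scores
# under ./bfcl-smoke/score/.
ls -R ./bfcl-smoke/score
```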
@@ -72,7 +72,7 @@ obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"

-# call script to generate indicies for all existing wheels
+# call script to generate indices for all existing wheels
 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
 # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
-# generate source tarball
+# generate source distribution using setup.py
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+python setup.py sdist --dist-dir=$DIST_DIR
 ls -la $DIST_DIR
+
+SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
+echo "Found sdist: $SDIST_FILE"
+
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
     exit 1
 fi

-python3 -m twine check "$PYPI_WHEEL_FILES"
+python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
+python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-echo "Wheels uploaded to PyPI"
+echo "Wheels and source distribution uploaded to PyPI"
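With the switch from `git archive` to `setup.py sdist`, the tarball can be validated the same way as the wheels before upload. A local dry run of the new steps (no upload), assuming a vllm checkout with setup.py:

```bash
#!/bin/bash
# Dry run of the sdist portion of the release script: build the source
# distribution and let twine validate its metadata.
DIST_DIR=dist
python setup.py sdist --dist-dir="$DIST_DIR"
SDIST_FILE=$(find "$DIST_DIR" -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
python3 -m twine check "$SDIST_FILE"
```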
(File diff suppressed because it is too large.)

@@ -14,8 +14,3 @@ steps:
    - pytest -v -s basic_correctness/test_cumem.py
    - pytest -v -s basic_correctness/test_basic_correctness.py
    - pytest -v -s basic_correctness/test_cpu_offload.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-        - image-build-amd
@@ -36,6 +36,16 @@ steps:
    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
    - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

+- label: AsyncTP Correctness Tests (B200)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 - label: Distributed Compile Unit Tests (2xH100)
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/"
@@ -91,8 +101,8 @@ steps:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"

 - label: Fusion E2E Config Sweep (H100)
   timeout_in_minutes: 30
@@ -122,9 +132,9 @@ steps:
   commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"

 - label: Fusion E2E TP2 Quick (H100)
   timeout_in_minutes: 20
@@ -140,8 +150,8 @@ steps:
   commands:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

 - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
   timeout_in_minutes: 40
@@ -195,7 +205,7 @@ steps:
   commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
@@ -50,23 +50,18 @@ steps:
    - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
    - pytest -v -s v1/worker/test_worker_memory_snapshot.py

-- label: Distributed Tests (4 GPUs)
+- label: Distributed Torchrun + Examples (4 GPUs)
-  timeout_in_minutes: 50
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
   num_devices: 4
   source_file_dependencies:
    - vllm/distributed/
-    - tests/distributed/test_utils
+    - tests/distributed/test_torchrun_example.py
-    - tests/distributed/test_pynccl
+    - tests/distributed/test_torchrun_example_moe.py
-    - tests/distributed/test_events
-    - tests/compile/fullgraph/test_basic_correctness.py
    - examples/offline_inference/rlhf.py
    - examples/offline_inference/rlhf_colocate.py
    - examples/offline_inference/new_weight_syncing/
    - tests/examples/offline_inference/data_parallel.py
-    - tests/v1/distributed
-    - tests/v1/engine/test_engine_core_client.py
-    - tests/distributed/test_symm_mem_allreduce.py
   commands:
    # https://github.com/NVIDIA/nccl/issues/1838
    - export NCCL_CUMEM_HOST_ENABLE=0
@@ -84,6 +79,27 @@ steps:
    - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
    # test with internal dp
    - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+    # OLD rlhf examples
+    - cd ../examples/offline_inference
+    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+    # NEW rlhf examples
+    - cd new_weight_syncing
+    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
+
+- label: Distributed DP Tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/
+    - tests/v1/distributed
+    - tests/v1/engine/test_engine_core_client.py
+    - tests/distributed/test_utils
+  commands:
+    # https://github.com/NVIDIA/nccl/issues/1838
+    - export NCCL_CUMEM_HOST_ENABLE=0
    - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
    - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
    - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -91,19 +107,27 @@ steps:
    - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
    - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
    - pytest -v -s distributed/test_utils.py
+
+- label: Distributed Compile + Comm (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/
+    - tests/distributed/test_pynccl
+    - tests/distributed/test_events
+    - tests/compile/fullgraph/test_basic_correctness.py
+    - tests/distributed/test_symm_mem_allreduce.py
+    - tests/distributed/test_multiproc_executor.py
+  commands:
+    # https://github.com/NVIDIA/nccl/issues/1838
+    - export NCCL_CUMEM_HOST_ENABLE=0
    - pytest -v -s compile/fullgraph/test_basic_correctness.py
    - pytest -v -s distributed/test_pynccl.py
    - pytest -v -s distributed/test_events.py
    - pytest -v -s distributed/test_symm_mem_allreduce.py
-    # TODO: create a dedicated test section for multi-GPU example tests
+    # test multi-node TP with multiproc executor (simulated on single node)
-    # when we have multiple distributed example tests
+    - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
-    # OLD rlhf examples
-    - cd ../examples/offline_inference
-    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-    # NEW rlhf examples
-    - cd new_weight_syncing
-    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py

 - label: Distributed Tests (8 GPUs)(H100)
   timeout_in_minutes: 10
@@ -209,6 +233,19 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

+- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
+  timeout_in_minutes: 30
+  device: a100
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/v1/worker/kv_connector_model_runner_mixin.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
+
 - label: Pipeline + Context Parallelism (4 GPUs)
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/tests"
@@ -14,25 +14,59 @@ steps:
   commands:
    - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

-- label: V1 e2e + engine
+- label: Engine (1 GPU)
-  timeout_in_minutes: 45
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/engine/
+    - tests/v1/engine/
+  commands:
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
+
+- label: e2e Scheduling (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
+  commands:
+    - pytest -v -s v1/e2e/general/test_async_scheduling.py
+
+- label: e2e Core (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
+  commands:
+    - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
+
+- label: V1 e2e (2 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 2
   source_file_dependencies:
    - vllm/
-    - tests/v1
+    - tests/v1/e2e
   commands:
-    # TODO: accuracy does not match, whether setting
+    # Only run tests that need exactly 2 GPUs
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
-    - pytest -v -s v1/e2e
-    # Run this test standalone for now;
-    # need to untangle use (implicit) use of spawn/fork across the tests.
-    - pytest -v -s v1/engine/test_preprocess_error_handling.py
-    # Run the rest of v1/engine tests
-    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi325_2
       depends_on:
        - image-build-amd
+
+- label: V1 e2e (4 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
   commands:
-    - pytest -v -s v1/e2e
+    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/engine
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
+  mirror:
+    amd:
+      device: mi325_4
+      depends_on:
+        - image-build-amd
@@ -24,11 +24,6 @@ steps:
    - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
    - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
    - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-        - image-build-amd

 - label: Entrypoints Integration (API Server 1)
   timeout_in_minutes: 130
@@ -39,8 +34,13 @@ steps:
    - tests/entrypoints/test_chat_utils
   commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+    - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
    - pytest -v -s entrypoints/test_chat_utils.py
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+        - image-build-amd

 - label: Entrypoints Integration (API Server 2)
   timeout_in_minutes: 130
@@ -65,11 +65,6 @@ steps:
   commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s entrypoints/pooling
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-        - image-build-amd

 - label: Entrypoints Integration (Responses API)
   timeout_in_minutes: 50
@@ -87,6 +82,11 @@ steps:
    - tests/v1
   commands:
    - pytest -v -s v1/entrypoints
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+        - image-build-amd

 - label: OpenAI API Correctness
   timeout_in_minutes: 30
@@ -21,3 +21,18 @@ steps:
   commands:
    - pytest -v -s distributed/test_eplb_execute.py
    - pytest -v -s distributed/test_eplb_spec_decode.py
+
+- label: Elastic EP Scaling Test
+  timeout_in_minutes: 20
+  device: b200
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/
+    - vllm/engine/
+    - vllm/executor/
+    - vllm/compilation/
+    - tests/distributed/
+  commands:
+    - pytest -v -s distributed/test_elastic_ep.py
@@ -8,8 +8,9 @@ steps:
    - csrc/
    - tests/kernels/core
    - tests/kernels/test_top_k_per_row.py
+    - tests/kernels/test_concat_mla_q.py
   commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py

 - label: Kernels Attention Test %N
   timeout_in_minutes: 35
@@ -44,7 +45,8 @@ steps:
    - vllm/envs.py
    - vllm/config
   commands:
-    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2

 - label: Kernels Mamba Test
@@ -70,7 +72,7 @@ steps:
    - tests/kernels/moe/test_batched_deepgemm.py
    - tests/kernels/attention/test_deepgemm_attention.py
   commands:
-    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/quantization/test_block_fp8.py
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -95,7 +97,7 @@ steps:
    - vllm/platforms/cuda.py
   commands:
    - nvidia-smi
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -155,5 +157,14 @@ steps:
   commands:
    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
    - pytest -v -s kernels/moe/test_deepep_moe.py
-    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
-    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
@@ -11,17 +11,17 @@ steps:
   commands:
    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

-- label: LM Eval Large Models (4 GPUs)(A100)
+# - label: LM Eval Large Models (4 GPUs)(A100)
-  device: a100
+#   device: a100
-  optional: true
+#   optional: true
-  num_devices: 4
+#   num_devices: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+#   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
+#   source_file_dependencies:
-    - csrc/
+#     - csrc/
-    - vllm/model_executor/layers/quantization
+#     - vllm/model_executor/layers/quantization
-  commands:
+#   commands:
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+#     - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
   device: h100
@@ -9,6 +9,7 @@ steps:
    - tests/v1
   commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -66,12 +67,13 @@ steps:
    - examples/
   commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic/chat.py # for basic
+    # for basic
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/chat.py
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/classify.py
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/embed.py
+    - python3 basic/offline_inference/classify.py
-    - python3 offline_inference/basic/score.py
+    - python3 basic/offline_inference/embed.py
+    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
@@ -9,9 +9,9 @@ steps:
    - vllm/config/model.py
    - vllm/model_executor
    - tests/model_executor
-    - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+    - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
   commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py

.buildkite/test_areas/model_runner_v2.yaml (new file, 110 lines)
@@ -0,0 +1,110 @@
+group: Model Runner V2
+depends_on:
+  - image-build
+steps:
+- label: Model Runner V2 Core Tests
+  timeout_in_minutes: 45
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - vllm/v1/core/sched/
+    - vllm/v1/attention/
+    - tests/v1/engine/test_llm_engine.py
+    - tests/v1/e2e/
+    - tests/v1/entrypoints/llm/test_struct_output_generate.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
+    # This requires eager until we sort out CG correctness issues.
+    # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
+    - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
+    - pytest -v -s v1/e2e/general/test_context_length.py
+    - pytest -v -s v1/e2e/general/test_min_tokens.py
+    # Temporary hack filter to exclude ngram spec decoding based tests.
+    - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
+
+- label: Model Runner V2 Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/core/sched/
+    - vllm/v1/worker/gpu_worker.py
+    - examples/offline_inference/
+    - examples/basic/offline_inference/
+    - examples/pooling/embed/vision_embedding_offline.py
+    - examples/others/tensorize_vllm_model.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pip install tensorizer # for tensorizer test
+    - python3 basic/offline_inference/chat.py # for basic
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
+    #- python3 basic/offline_inference/embed.py # TODO
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Model Runner V2 Distributed (2 GPUs)
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/basic_correctness/test_basic_correctness.py
+    - tests/v1/distributed/test_async_llm_dp.py
+    - tests/v1/distributed/test_eagle_dp.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
+    - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
+    # https://github.com/NVIDIA/nccl/issues/1838
+    - export NCCL_CUMEM_HOST_ENABLE=0
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+
+# These require fix https://github.com/vllm-project/vllm/pull/36280
+- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/distributed/test_pipeline_parallel.py
+    #- tests/distributed/test_pp_cudagraph.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
+    # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
+    #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
|
||||||
|
|
||||||
|
- label: Model Runner V2 Spec Decode
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/v1/worker/gpu/
|
||||||
|
- vllm/v1/worker/gpu_worker.py
|
||||||
|
- tests/v1/spec_decode/test_max_len.py
|
||||||
|
- tests/v1/e2e/spec_decode/test_spec_decode.py
|
||||||
|
commands:
|
||||||
|
- set -x
|
||||||
|
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||||
|
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
|
||||||
|
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
|
||||||
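Every step in this new test area exports `VLLM_USE_V2_MODEL_RUNNER=1` before running its commands, so the same test files exercise the V2 runner instead of the default one. A minimal sketch of how an opt-in environment flag like this is usually consumed (the helper and runner names below are illustrative assumptions, not vLLM's actual API):

```python
import os


def use_v2_model_runner() -> bool:
    """Illustrative helper: treat VLLM_USE_V2_MODEL_RUNNER=1 as opting in
    to the V2 GPU model runner; anything else keeps the default runner."""
    return os.environ.get("VLLM_USE_V2_MODEL_RUNNER", "0") == "1"


# A test harness could then pick the worker implementation up front.
# "gpu_v2" / "gpu_v1" are hypothetical names for this sketch only.
runner_cls = "gpu_v2" if use_v2_model_runner() else "gpu_v1"
print(f"Selected model runner: {runner_cls}")
```
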
@@ -65,7 +65,7 @@ steps:
   - pytest -v -s tests/models/test_transformers.py
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
-  - python3 examples/offline_inference/basic/chat.py
+  - python3 examples/basic/offline_inference/chat.py
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -2,16 +2,65 @@ group: Models - Multimodal
 depends_on:
 - image-build
 steps:
-- label: Multi-Modal Models (Standard) # 60min
-  timeout_in_minutes: 80
+- label: "Multi-Modal Models (Standard) 1: qwen2"
+  timeout_in_minutes: 45
   source_file_dependencies:
   - vllm/
   - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pip freeze | grep -E 'torch'
-  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
+  - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
+  - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
+  - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 4: other + whisper"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Processor Test (CPU)
   depends_on:
@@ -20,6 +69,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/models/multimodal
+  - tests/models/registry.py
   device: cpu
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
@@ -30,6 +80,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/models/multimodal
+  - tests/models/registry.py
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -52,6 +103,11 @@ steps:
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 2
   optional: true
@@ -70,12 +126,3 @@ steps:
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-
-# This test is used only in PR development phase to test individual models and should never run on main
-- label: Custom Models
-  optional: true
-  commands:
-  - echo 'Testing custom models...'
-  # PR authors can temporarily add commands below to test individual models
-  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

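The split above works because the four shards' selectors are complementary: shards 1-3 carve up `test_common.py` with `-k` expressions, and shard 4 takes whatever `models/multimodal` tests remain via `--ignore`. A quick sanity check of that partitioning logic, using a hypothetical model list (the real names live in the pytest parametrization):

```python
# Hypothetical model names; the real list is defined by the test suite.
models = ["qwen2_vl", "qwen2_5_omni", "qwen3_vl", "gemma3", "llava_next", "phi4_mm"]

# Mirror the three -k expressions used by shards 1-3.
shard1 = {m for m in models if "qwen2" in m}
shard2 = {m for m in models if "qwen3" in m or "gemma" in m}
shard3 = {m for m in models if "qwen2" not in m and "qwen3" not in m and "gemma" not in m}

# Every model lands in exactly one shard, matching pytest's -k semantics.
assert shard1 | shard2 | shard3 == set(models)
assert not (shard1 & shard2) and not (shard1 & shard3) and not (shard2 & shard3)
```
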
@@ -15,10 +15,17 @@ steps:
   - pytest -v -s plugins_tests/test_platform_plugins.py
   - pip uninstall vllm_add_dummy_platform -y
   # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  # begin io_processor plugins test
+  # test generic io_processor plugins functions
+  - pytest -v -s ./plugins_tests/test_io_processor_plugins.py
+  # test Terratorch io_processor plugins
   - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
   - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
   # end io_processor plugins test
   # begin stat_logger plugins test
   - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -29,6 +36,6 @@ steps:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

.buildkite/test_areas/ray_compat.yaml (new file, 16 lines)
@@ -0,0 +1,16 @@
+group: Ray Compatibility
+depends_on:
+- image-build
+steps:
+- label: Ray Dependency Compatibility Check
+  # Informational only — does not block the pipeline.
+  # If this fails, it means the PR introduces a dependency that
+  # conflicts with Ray's dependency constraints.
+  # See https://github.com/vllm-project/vllm/issues/33599
+  soft_fail: true
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - requirements/
+  - setup.py
+  commands:
+  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh

.buildkite/test_areas/spec_decode.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
+group: Spec Decode
+depends_on:
+- image-build
+steps:
+- label: Spec Decode Eagle
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/v1/worker/gpu/spec_decode/
+  - tests/v1/e2e/spec_decode/
+  commands:
+  - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
+
+- label: Spec Decode Speculators + MTP
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/v1/worker/gpu/spec_decode/
+  - vllm/transformers_utils/configs/speculators/
+  - tests/v1/e2e/spec_decode/
+  commands:
+  - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
+
+- label: Spec Decode Ngram + Suffix
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/v1/worker/gpu/spec_decode/
+  - tests/v1/e2e/spec_decode/
+  commands:
+  - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
+
+- label: Spec Decode Draft Model
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/v1/worker/gpu/spec_decode/
+  - tests/v1/e2e/spec_decode/
+  commands:
+  - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"

@@ -13,13 +13,13 @@ steps:
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 
-- label: Weight Loading Multiple GPU - Large Models # optional
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
-  device: a100
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+# - label: Weight Loading Multiple GPU - Large Models # optional
+#   working_dir: "/vllm-workspace/tests"
+#   num_devices: 2
+#   device: a100
+#   optional: true
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/weight_loading
+#   commands:
+#   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

.github/.bc-linter.yml (deleted, 24 lines)
@@ -1,24 +0,0 @@
-# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
-version: 1
-paths:
-  # We temporarily disable globally, and will only enable with `annotations.include`
-  # include:
-  #   - "vllm/v1/attetion/*.py"
-  #   - "vllm/v1/core/*.py"
-  exclude:
-  - "**/*.py"
-
-scan:
-  functions: true # check free functions and methods
-  classes: true # check classes/dataclasses
-  public_only: true # ignore names starting with "_" at any level
-
-annotations:
-  include: # decorators that force-include a symbol
-  - name: "bc_linter_include" # matched by simple name or dotted suffix
-    propagate_to_members: false # for classes, include methods/inner classes
-  exclude: # decorators that force-exclude a symbol
-  - name: "bc_linter_skip" # matched by simple name or dotted suffix
-    propagate_to_members: true # for classes, exclude methods/inner classes
-
-excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]

.github/CODEOWNERS (9 changes)
@@ -2,7 +2,7 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
 /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 /vllm/lora @jeejeelee
 /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
@@ -54,11 +54,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/kv_offload @ApostaC @orozery
-/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/engine @njhill
+/vllm/v1/executor @njhill
+/vllm/v1/worker @njhill
 /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
 
 # Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon
+/vllm/v1/worker/gpu @WoosukKwon @njhill
+/vllm/v1/worker/gpu/kv_connector.py @orozery
+
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin

.github/mergify.yml (16 changes)
@@ -3,6 +3,7 @@ pull_request_rules:
   description: Automatically apply documentation label
   conditions:
     - label != stale
+    - -closed
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -26,7 +27,7 @@ pull_request_rules:
         Hi @{{author}}, the pre-commit checks have failed. Please run:
 
         ```bash
-        uv pip install pre-commit
+        uv pip install pre-commit>=4.5.1
         pre-commit install
         pre-commit run --all-files
         ```
@@ -37,15 +38,13 @@ pull_request_rules:
 
         > [!TIP]
         > <details>
-        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <summary>Is <code>mypy</code> failing?</summary>
         > <br/>
-        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        > <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
         >
         > ```bash
         > # For mypy (substitute "3.10" with the failing version if needed)
         > pre-commit run --hook-stage manual mypy-3.10
-        > # For markdownlint
-        > pre-commit run --hook-stage manual markdownlint
         > ```
         > </details>
 
@@ -259,8 +258,7 @@ pull_request_rules:
       - files=benchmarks/run_structured_output_benchmark.sh
       - files=docs/features/structured_outputs.md
       - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+      - files=examples/online_serving/structured_outputs/structured_outputs.py
       - files~=^tests/v1/structured_output/
       - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
@@ -336,7 +334,7 @@ pull_request_rules:
       - or:
         - files~=^tests/tool_use/
         - files~=^tests/entrypoints/openai/tool_parsers/
-        - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+        - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
         - files~=^vllm/entrypoints/openai/tool_parsers/
         - files=docs/features/tool_calling.md
         - files~=^examples/tool_chat_*
@@ -383,7 +381,7 @@ pull_request_rules:
       - or:
         - files~=^vllm/model_executor/model_loader/tensorizer.py
         - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
-        - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+        - files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
         - files~=^tests/model_executor/model_loader/tensorizer_loader/
     actions:
       assign:

.github/workflows/bc-lint.yml (deleted, 29 lines)
@@ -1,29 +0,0 @@
-name: BC Lint
-
-on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      - labeled
-      - unlabeled
-
-jobs:
-  bc_lint:
-    if: github.repository_owner == 'vllm-project'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Run BC Lint Action
-        uses: pytorch/test-infra/.github/actions/bc-lint@main
-        with:
-          repo: ${{ github.event.pull_request.head.repo.full_name }}
-          base_sha: ${{ github.event.pull_request.base.sha }}
-          head_sha: ${{ github.event.pull_request.head.sha }}
-          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
-          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
-          config_dir: .github
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true

.github/workflows/macos-smoke-test.yml (3 changes)
@@ -6,6 +6,9 @@ on:
       - main
   workflow_dispatch: # Manual trigger
 
+permissions:
+  contents: read
+
 jobs:
   macos-m1-smoke-test:
     runs-on: macos-latest

.gitignore (4 changes)
@@ -3,6 +3,8 @@
 
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/__init__.py
+!vllm/vllm_flash_attn/flash_attn_interface.py
 
 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*
@@ -187,11 +189,9 @@ cython_debug/
 .vscode/
 
 # Claude
-CLAUDE.md
 .claude/
 
 # Codex
-AGENTS.md
 .codex/
 
 # Cursor

@@ -13,7 +13,7 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.43.5
   hooks:
   - id: typos
     args: [--force-exclude]
@@ -24,12 +24,13 @@ repos:
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
     types_or: [c++, cuda]
     args: [--style=file, --verbose]
-- repo: https://github.com/igorshubovych/markdownlint-cli
-  rev: v0.45.0
+- repo: https://github.com/DavidAnson/markdownlint-cli2
+  rev: v0.21.0
   hooks:
-  - id: markdownlint
-    exclude: '.*\.inc\.md'
-    stages: [manual] # Only run in CI
+  - id: markdownlint-cli2
+    language_version: lts
+    args: [--fix]
+    exclude: ^CLAUDE\.md$
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
   hooks:
@@ -55,7 +56,7 @@ repos:
     language: python
     types_or: [python, pyi]
     require_serial: true
-    additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+    additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
     entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -127,6 +128,13 @@ repos:
     language: python
     types: [python]
     additional_dependencies: [regex]
+  # prevent use torch.cuda APIs
+  - id: check-torch-cuda-call
+    name: "Prevent new 'torch.cuda' APIs call"
+    entry: python tools/pre_commit/check_torch_cuda.py
+    language: python
+    types: [python]
+    additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/pre_commit/validate_config.py

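The new `check-torch-cuda-call` hook runs `tools/pre_commit/check_torch_cuda.py`, whose source is not part of this diff. A minimal sketch of what such a checker could look like, assuming a simple line-based regex scan (the escape-hatch comment here is an invented example, not the real tool's syntax):

```python
import re
import sys

# Flag direct torch.cuda usage so platform-agnostic wrappers get used instead.
TORCH_CUDA_CALL = re.compile(r"\btorch\.cuda\.\w+")


def main(paths: list[str]) -> int:
    bad = 0
    for path in paths:
        with open(path, encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                # "# noqa: torch-cuda" is a hypothetical opt-out marker.
                if TORCH_CUDA_CALL.search(line) and "# noqa: torch-cuda" not in line:
                    print(f"{path}:{lineno}: avoid direct torch.cuda calls")
                    bad += 1
    # pre-commit treats a nonzero exit code as a failed hook.
    return 1 if bad else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
```
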
@@ -9,6 +9,7 @@ build:
   python: "3.12"
   jobs:
     post_checkout:
+      # - bash docs/maybe_skip_pr_build.sh
       - git fetch origin main --unshallow --no-tags --filter=blob:none || true
     pre_create_environment:
       - pip install uv

AGENTS.md (new file, 113 lines)
@@ -0,0 +1,113 @@
+# Agent Instructions for vLLM
+
+> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
+> Breaching these guidelines can result in automatic banning.
+
+## 1. Contribution Policy (Mandatory)
+
+### Duplicate-work checks
+
+Before proposing a PR, run these checks:
+
+```bash
+gh issue view <issue_number> --repo vllm-project/vllm --comments
+gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
+gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
+```
+
+- If an open PR already addresses the same fix, do not open another.
+- If your approach is materially different, explain the difference in the issue.
+
+### No low-value busywork PRs
+
+Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
+
+### Accountability
+
+- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
+- The submitting human must review every changed line and run relevant tests.
+- PR descriptions for AI-assisted work **must** include:
+  - Why this is not duplicating an existing PR.
+  - Test commands run and results.
+  - Clear statement that AI assistance was used.
+
+### Fail-closed behavior
+
+If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
+
+---
+
+## 2. Development Workflow
+
+### Environment setup
+
+```bash
+# Install `uv` if you don't have it already:
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Always use `uv` for Python environment management:
+uv venv --python 3.12
+source .venv/bin/activate
+
+# Always make sure `pre-commit` and its hooks are installed:
+uv pip install -r requirements/lint.txt
+pre-commit install
+```
+
+### Installing dependencies
+
+```bash
+# If you are only making Python changes:
+VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# If you are also making C/C++ changes:
+uv pip install -e .
+```
+
+### Running tests
+
+Tests require extra dependencies.
+All versions for test dependencies should be read from `requirements/test.txt`
+
+```bash
+# Install bare minimum test dependencies:
+uv pip install pytest pytest-asyncio tblib
+
+# Install additional test dependencies as needed, or install them all as follows:
+uv pip install -r requirements/test.txt
+
+# Run specific test from specific test file
+pytest tests/path/to/test.py -v -s -k test_name
+
+# Run all tests in directory
+pytest tests/path/to/dir -v -s
+```
+
+### Running linters
+
+```bash
+# Run all pre-commit hooks on staged files:
+pre-commit run
+
+# Run on all files:
+pre-commit run --all-files
+
+# Run a specific hook:
+pre-commit run ruff-check --all-files
+
+# Run mypy as it is in CI:
+pre-commit run mypy-3.10 --all-files --hook-stage manual
+```
+
+### Commit messages
+
+Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
+
+```text
+Your commit message here
+
+Co-authored-by: GitHub Copilot
+Co-authored-by: Claude
+Co-authored-by: gemini-code-assist
+Signed-off-by: Your Name <your.email@example.com>
+```

@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
 
 # ROCm installation prefix. Default to /opt/rocm but allow override via
 # -DROCM_PATH=/your/rocm/path when invoking cmake.
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # CUTLASS MoE kernels
 
   # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
@@ -771,6 +771,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
+    set(SRCS
+      "csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
+      "csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
+    message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
+        AND ES_MXFP8_GROUPED_MM_ARCHS)
+      message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
+                     "not >= 12.8.")
+    else()
+      message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
+                     "in CUDA target architectures.")
+    endif()
+  endif()
+
   # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
     cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
@@ -971,7 +998,8 @@ set(VLLM_MOE_EXT_SRC
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC
     "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu")
+    "csrc/moe/grouped_topk_kernels.cu"
+    "csrc/moe/router_gemm.cu")
 endif()
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")

@@ -187,7 +187,7 @@ python benchmark.py \
 ## Hardware Requirements
 
 | Backend | Hardware |
-|---------|----------|
+| ------- | -------- |
 | Flash/Triton/FlashInfer | Any CUDA GPU |
 | CUTLASS MLA | Blackwell (SM100+) |
 | FlashAttn MLA | Hopper (SM90+) |

@@ -15,7 +15,6 @@ from .common import (
     BenchmarkConfig,
     BenchmarkResult,
     MockLayer,
-    MockModelConfig,
     ResultsFormatter,
     get_attention_scale,
     is_mla_backend,
@@ -36,7 +35,6 @@ __all__ = [
     "ResultsFormatter",
     # Mock objects
     "MockLayer",
-    "MockModelConfig",
     # Utilities
     "setup_mla_dims",
     "get_attention_scale",

@@ -47,6 +47,8 @@ from common import (
     is_mla_backend,
 )
 
+from vllm.v1.worker.workspace import init_workspace_manager
+
 
 def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
     """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -59,7 +61,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
     """Run MLA benchmark with appropriate backend."""
    from mla_runner import run_mla_benchmark as run_mla
 
-    return run_mla(config.backend, config, **kwargs)
+    return run_mla(
+        config.backend, config, prefill_backend=config.prefill_backend, **kwargs
+    )
 
 
 def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
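The `run_mla` change above switches from passing the prefill backend implicitly through `**kwargs` to forwarding it explicitly from the config. A minimal sketch of that forwarding pattern, with illustrative names rather than the benchmark's real signatures:

```python
from dataclasses import dataclass


@dataclass
class Config:
    backend: str
    prefill_backend: str | None = None


def run_mla(backend: str, config: Config, *, prefill_backend: str | None = None, **kwargs):
    # The decode backend builds the impl; the prefill backend can now vary
    # independently of it.
    return backend, prefill_backend


def run_mla_benchmark(config: Config, **kwargs):
    # Explicit forwarding: the field travels with the config instead of
    # depending on every caller to thread it through **kwargs.
    return run_mla(config.backend, config, prefill_backend=config.prefill_backend, **kwargs)


print(run_mla_benchmark(Config("flashattn_mla", prefill_backend="fa3")))
# ('flashattn_mla', 'fa3')
```
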
@@ -440,20 +444,27 @@ def main():
     # Backend selection
     parser.add_argument(
         "--backends",
+        "--decode-backends",
         nargs="+",
-        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
+        help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
         "flashinfer_mla, flashattn_mla, flashmla)",
     )
     parser.add_argument(
         "--backend",
         help="Single backend (alternative to --backends)",
     )
+    parser.add_argument(
+        "--prefill-backends",
+        nargs="+",
+        help="Prefill backends to compare (fa2, fa3, fa4). "
+        "Uses the first decode backend for impl construction.",
+    )
 
     # Batch specifications
     parser.add_argument(
         "--batch-specs",
         nargs="+",
-        default=["q2k", "8q1s1k"],
+        default=None,
         help="Batch specifications using extended grammar",
     )
 
@@ -469,6 +480,21 @@ def main():
     parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
     parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
     parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        default="auto",
+        choices=["auto", "fp8"],
+        help="KV cache dtype: auto or fp8",
+    )
+    parser.add_argument(
+        "--cuda-graphs",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Launch kernels with CUDA graphs to eliminate CPU overhead"
+            "in measurements (default: True)"
+        ),
+    )
 
     # Parameter sweep (use YAML config for advanced sweeps)
     parser.add_argument(
@@ -502,7 +528,7 @@ def main():
 
     # Override args with YAML values, but CLI args take precedence
     # Check if CLI provided backends (they would be non-None and not default)
-    cli_backends_provided = args.backends is not None or args.backend is not None
+    cli_backends_provided = args.backend is not None or args.backends is not None
 
     # Backend(s) - only use YAML if CLI didn't specify
     if not cli_backends_provided:
@@ -512,6 +538,12 @@ def main():
         elif "backends" in yaml_config:
             args.backends = yaml_config["backends"]
             args.backend = None
+        elif "decode_backends" in yaml_config:
+            args.backends = yaml_config["decode_backends"]
+            args.backend = None
+
+    # Prefill backends (e.g., ["fa3", "fa4"])
+    args.prefill_backends = yaml_config.get("prefill_backends", None)
 
     # Check for special modes
     if "mode" in yaml_config:
@@ -521,6 +553,9 @@ def main():
 
     # Batch specs and sizes
     # Support both explicit batch_specs and generated batch_spec_ranges
+    # CLI --batch-specs takes precedence over YAML when provided.
+    cli_batch_specs_provided = args.batch_specs is not None
+    if not cli_batch_specs_provided:
         if "batch_spec_ranges" in yaml_config:
             # Generate batch specs from ranges
             generated_specs = generate_batch_specs_from_ranges(
@@ -560,6 +595,10 @@ def main():
         args.warmup_iters = yaml_config["warmup_iters"]
     if "profile_memory" in yaml_config:
         args.profile_memory = yaml_config["profile_memory"]
+    if "kv_cache_dtype" in yaml_config:
+        args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
+    if "cuda_graphs" in yaml_config:
+        args.cuda_graphs = yaml_config["cuda_graphs"]
 
     # Parameter sweep configuration
     if "parameter_sweep" in yaml_config:
@@ -613,10 +652,19 @@ def main():
 
     # Determine backends
     backends = args.backends or ([args.backend] if args.backend else ["flash"])
+    prefill_backends = getattr(args, "prefill_backends", None)
+    if not args.batch_specs:
+        args.batch_specs = ["q2k", "8q1s1k"]
     console.print(f"Backends: {', '.join(backends)}")
+    if prefill_backends:
+        console.print(f"Prefill backends: {', '.join(prefill_backends)}")
     console.print(f"Batch specs: {', '.join(args.batch_specs)}")
+    console.print(f"KV cache dtype: {args.kv_cache_dtype}")
+    console.print(f"CUDA graphs: {args.cuda_graphs}")
     console.print()
 
+    init_workspace_manager(args.device)
+
     # Run benchmarks
     all_results = []
 
@@ -669,6 +717,8 @@ def main():
                 repeats=args.repeats,
                 warmup_iters=args.warmup_iters,
                 profile_memory=args.profile_memory,
+                kv_cache_dtype=args.kv_cache_dtype,
+                use_cuda_graphs=args.cuda_graphs,
             )
 
             # Add decode pipeline config
@@ -821,6 +871,8 @@ def main():
             "repeats": args.repeats,
             "warmup_iters": args.warmup_iters,
             "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
         }
         all_results = run_model_parameter_sweep(
             backends,
@@ -843,6 +895,8 @@ def main():
             "repeats": args.repeats,
             "warmup_iters": args.warmup_iters,
             "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
         }
         all_results = run_parameter_sweep(
             backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -850,6 +904,12 @@ def main():
 
     else:
         # Normal mode: compare backends
+        decode_results = []
+        prefill_results = []
+
+        # Run decode backend comparison
+        if not prefill_backends:
+            # No prefill backends specified: compare decode backends as before
             total = len(backends) * len(args.batch_specs)
 
             with tqdm(total=total, desc="Benchmarking") as pbar:
@@ -867,20 +927,72 @@ def main():
                         repeats=args.repeats,
                         warmup_iters=args.warmup_iters,
                         profile_memory=args.profile_memory,
+                        kv_cache_dtype=args.kv_cache_dtype,
+                        use_cuda_graphs=args.cuda_graphs,
                     )
 
                     result = run_benchmark(config)
-                    all_results.append(result)
+                    decode_results.append(result)
 
                     if not result.success:
-                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
+                        console.print(
+                            f"[red]Error {backend} {spec}: {result.error}[/]"
+                        )
 
                     pbar.update(1)
 
-        # Display results
-        console.print("\n[bold green]Results:[/]")
-        formatter = ResultsFormatter(console)
-        formatter.print_table(all_results, backends)
+            console.print("\n[bold green]Results:[/]")
+            formatter = ResultsFormatter(console)
+            formatter.print_table(decode_results, backends)
+
+        # Run prefill backend comparison
+        if prefill_backends:
+            # Use first decode backend for impl construction
+            decode_backend = backends[0]
+            total = len(prefill_backends) * len(args.batch_specs)
+
+            console.print(
+                f"[yellow]Prefill comparison mode: "
+                f"using {decode_backend} for decode impl[/]"
+            )
+
+            with tqdm(total=total, desc="Prefill benchmarking") as pbar:
+                for spec in args.batch_specs:
+                    for pb in prefill_backends:
+                        config = BenchmarkConfig(
+                            backend=decode_backend,
+                            batch_spec=spec,
+                            num_layers=args.num_layers,
+                            head_dim=args.head_dim,
+                            num_q_heads=args.num_q_heads,
+                            num_kv_heads=args.num_kv_heads,
+                            block_size=args.block_size,
+                            device=args.device,
+                            repeats=args.repeats,
+                            warmup_iters=args.warmup_iters,
+                            profile_memory=args.profile_memory,
+                            prefill_backend=pb,
+                        )
+
+                        result = run_benchmark(config)
+
+                        # Label result with prefill backend name for display
+                        labeled_config = replace(result.config, backend=pb)
+                        result = replace(result, config=labeled_config)
+                        prefill_results.append(result)
+
+                        if not result.success:
+                            console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
+
+                        pbar.update(1)
+
+            console.print("\n[bold green]Prefill Backend Results:[/]")
+            formatter = ResultsFormatter(console)
+            formatter.print_table(
+                prefill_results, prefill_backends, compare_to_fastest=True
+            )
+
+        all_results = decode_results + prefill_results
+
     # Save results
     if all_results:

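The prefill-comparison branch introduced above relabels each result with the prefill backend name before display, applying `dataclasses.replace` twice: once on the nested config and once on the result object that holds it. In miniature, with stand-in dataclasses:

```python
from dataclasses import dataclass, replace


@dataclass
class Cfg:
    backend: str
    batch_spec: str


@dataclass
class Result:
    config: Cfg
    mean_time: float


result = Result(Cfg(backend="flashattn_mla", batch_spec="q2k"), mean_time=1.23)

# Relabel for display: the decode backend built the impl, but the results
# table should be keyed by the prefill backend under comparison.
labeled = replace(result, config=replace(result.config, backend="fa3"))
print(labeled.config.backend)  # fa3
```

Because `replace` builds new instances rather than mutating, the original result is untouched, which keeps the decode and prefill result lists independent.
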
@@ -10,7 +10,6 @@ from dataclasses import asdict, dataclass
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
import torch
|
||||||
from batch_spec import get_batch_type, parse_batch_spec
|
from batch_spec import get_batch_type, parse_batch_spec
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
@@ -31,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
|||||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||||
return (batch_size, max_q_len, max_kv_len)
|
return (batch_size, max_q_len, max_kv_len)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback for unparseable specs
|
# Fallback for unparsable specs
|
||||||
return (0, 0, 0)
|
return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
@@ -62,10 +61,7 @@ class MockHfConfig:
|
|||||||
# Import AttentionLayerBase at module level to avoid circular dependencies
|
# Import AttentionLayerBase at module level to avoid circular dependencies
|
||||||
try:
|
try:
|
||||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||||
|
|
||||||
_HAS_ATTENTION_LAYER_BASE = True
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_HAS_ATTENTION_LAYER_BASE = False
|
|
||||||
AttentionLayerBase = object # Fallback
|
AttentionLayerBase = object # Fallback
|
||||||
|
|
||||||
|
|
||||||
@@ -81,6 +77,7 @@ class MockKVBProj:
|
|||||||
self.qk_nope_head_dim = qk_nope_head_dim
|
self.qk_nope_head_dim = qk_nope_head_dim
|
||||||
self.v_head_dim = v_head_dim
|
self.v_head_dim = v_head_dim
|
||||||
self.out_dim = qk_nope_head_dim + v_head_dim
|
self.out_dim = qk_nope_head_dim + v_head_dim
|
||||||
|
self.weight = torch.empty(0, dtype=torch.bfloat16)
|
||||||
|
|
||||||
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
@@ -167,95 +164,6 @@ class MockLayer(AttentionLayerBase):
         return self._kv_cache_spec


-class MockModelConfig:
-    """Mock model configuration."""
-
-    def __init__(
-        self,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype = torch.float16,
-        max_model_len: int = 32768,
-    ):
-        self._n_q = num_q_heads
-        self._n_kv = num_kv_heads
-        self._d = head_dim
-        self.dtype = dtype
-        self.max_model_len = max_model_len
-
-    def get_num_attention_heads(self, _=None) -> int:
-        return self._n_q
-
-    def get_num_kv_heads(self, _=None) -> int:
-        return self._n_kv
-
-    def get_head_size(self) -> int:
-        return self._d
-
-    def get_num_layers(self) -> int:
-        """Mock method for layer count queries."""
-        return 1
-
-    def get_sliding_window_for_layer(self, _layer_idx: int):
-        """Mock method for sliding window queries."""
-        return None
-
-    def get_logits_soft_cap_for_layer(self, _layer_idx: int):
-        """Mock method for logits soft cap queries."""
-        return None
-
-    def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
-        """Mock method for SM scale queries."""
-        return 1.0 / (self.get_head_size() ** 0.5)
-
-
-class MockParallelConfig:
-    """Mock parallel configuration."""
-
-    pass
-
-
-class MockCompilationConfig:
-    """Mock compilation configuration."""
-
-    def __init__(self):
-        self.full_cuda_graph = False
-        self.static_forward_context = {}
-
-
-class MockVLLMConfig:
-    """Mock VLLM configuration."""
-
-    def __init__(self):
-        self.compilation_config = MockCompilationConfig()
-
-
-class MockRunner:
-    """Mock GPU runner for metadata builders."""
-
-    def __init__(
-        self,
-        seq_lens: np.ndarray,
-        query_start_locs: np.ndarray,
-        device: torch.device,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype,
-    ):
-        self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
-        self.parallel_config = MockParallelConfig()
-        self.vllm_config = MockVLLMConfig()
-        self.seq_lens_np = seq_lens
-        self.query_start_loc_np = query_start_locs
-        self.device = device
-        self.attention_chunk_size = None
-        self.num_query_heads = num_q_heads
-        self.num_kv_heads = num_kv_heads
-        self.dtype = dtype
-
-
 @dataclass
 class ParameterSweep:
     """Configuration for sweeping a backend parameter."""
@@ -305,7 +213,11 @@ class BenchmarkConfig:
     profile_memory: bool = False
     use_cuda_graphs: bool = False

+    # "auto" or "fp8"
+    kv_cache_dtype: str = "auto"
+
     # MLA-specific
+    prefill_backend: str | None = None
     kv_lora_rank: int | None = None
     qk_nope_head_dim: int | None = None
     qk_rope_head_dim: int | None = None
@@ -460,6 +372,7 @@ class ResultsFormatter:
             "backend",
             "batch_spec",
             "num_layers",
+            "kv_cache_dtype",
             "mean_time",
             "std_time",
             "throughput",
@@ -473,6 +386,7 @@ class ResultsFormatter:
             "backend": r.config.backend,
             "batch_spec": r.config.batch_spec,
             "num_layers": r.config.num_layers,
+            "kv_cache_dtype": r.config.kv_cache_dtype,
             "mean_time": r.mean_time,
             "std_time": r.std_time,
             "throughput": r.throughput_tokens_per_sec or 0,
@@ -30,9 +30,9 @@ batch_specs:
   - "2q16k_32q1s4k"  # 2 very large prefill + 32 decode

   # Context extension + decode
-  - "2q1kkv2k_16q1s1k"  # 2 extend + 16 decode
-  - "4q2kkv4k_32q1s2k"  # 4 extend + 32 decode
-  - "2q1kkv8k_32q1s2k"  # 2 large extend + 32 decode
+  - "2q1ks2k_16q1s1k"  # 2 extend + 16 decode
+  - "4q2ks4k_32q1s2k"  # 4 extend + 32 decode
+  - "2q1ks8k_32q1s2k"  # 2 large extend + 32 decode

   # Explicitly chunked prefill
   - "q8k"  # 8k prefill with chunking hint
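For readers decoding the spec strings: the reading below — `<reqs>q<q_len>[s<kv_len>]`, with "k" meaning x1024 and underscore-joined segments forming a mixed batch — is inferred from the inline comments ("2 extend + 16 decode"). An illustrative sketch only, not the repository's parse_batch_spec:

import re

def decode_segment(seg: str) -> tuple[int, int, int]:
    # e.g. "2q1ks2k" -> (2 requests, q_len 1024, kv_len 2048); "32q1s1k" -> (32, 1, 1024)
    m = re.fullmatch(r"(\d*)q(\d+k?)(?:s(\d+k?))?", seg)
    if m is None:
        raise ValueError(f"unparsable segment: {seg!r}")
    to_int = lambda s: int(s[:-1]) * 1024 if s.endswith("k") else int(s)
    num = int(m.group(1)) if m.group(1) else 1
    q_len = to_int(m.group(2))
    kv_len = to_int(m.group(3)) if m.group(3) else q_len  # pure prefill: kv == q
    return num, q_len, kv_len

# "2q1ks2k_16q1s1k": 2 extend requests + 16 decode requests
print([decode_segment(s) for s in "2q1ks2k_16q1s1k".split("_")])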
@@ -1,4 +1,19 @@
-# MLA prefill-only benchmark configuration for sparse backends
+# MLA prefill backend comparison
+#
+# Compares all available MLA prefill backends:
+#   FA backends: fa2, fa3, fa4 (FlashAttention versions)
+#   Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
+#
+# Uses cutlass_mla as the decode backend for impl construction
+# (only the prefill path is exercised).
+#
+# Backends that aren't available on the current platform will report errors
+# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
+#
+# Usage:
+#   python benchmark.py --config configs/mla_prefill.yaml
+
+description: "MLA prefill backend comparison"

 model:
   name: "deepseek-v3"
@@ -12,20 +27,25 @@ model:
   v_head_dim: 128
   block_size: 128

-# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
-# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
-model_parameter_sweep:
-  param_name: "num_q_heads"
-  values: [128, 64, 32, 16]
-  label_format: "{backend}_{value}h"
+# model:
+#   name: "deepseek-v2-lite"
+#   num_layers: 27
+#   num_q_heads: 16
+#   num_kv_heads: 1
+#   head_dim: 576
+#   kv_lora_rank: 512
+#   qk_nope_head_dim: 128
+#   qk_rope_head_dim: 64
+#   v_head_dim: 128
+#   block_size: 128

 batch_specs:
   # Pure prefill
-  - "1q512"
-  - "1q1k"
-  - "1q2k"
-  - "1q4k"
-  - "1q8k"
+  - "q512"
+  - "q1k"
+  - "q2k"
+  - "q4k"
+  - "q8k"

   # Batched pure prefill
   - "2q512"
@@ -44,19 +64,63 @@ batch_specs:
   - "8q4k"
   - "8q8k"

-  # Extend
-  - "1q512s4k"
-  - "1q512s8k"
-  - "1q1ks8k"
-  - "1q2ks8k"
-  - "1q2ks16k"
-  - "1q4ks16k"
+  # Chunked prefill / extend
+  # Short context
+  - "q128s1k"
+  - "q256s2k"
+  - "q512s4k"
+  - "q1ks4k"
+  - "q2ks8k"
+  - "2q128s1k"
+  - "2q256s2k"
+  - "2q512s4k"
+  - "2q1ks4k"
+  - "2q2ks8k"
+  - "4q128s1k"
+  - "4q256s2k"
+  - "4q512s4k"
+  - "4q1ks4k"
+  - "4q2ks8k"
+  - "8q128s1k"
+  - "8q256s2k"
+  - "8q512s4k"
+  - "8q1ks4k"

-backends:
-  - FLASHMLA_SPARSE
-  - FLASHINFER_MLA_SPARSE
+  # Medium context
+  - "q128s16k"
+  - "q512s16k"
+  - "q1ks16k"
+  - "q2ks16k"
+  - "2q128s16k"
+  - "2q512s16k"
+  - "2q1ks16k"
+  - "2q2ks16k"
+  - "4q128s16k"
+  - "4q512s16k"
+  - "4q1ks16k"
+  - "4q2ks16k"
+
+  # Long context
+  - "q128s64k"
+  - "q512s64k"
+  - "q1ks64k"
+  - "q2ks64k"
+  - "2q128s64k"
+  - "2q512s64k"
+  - "2q1ks64k"
+  - "2q2ks64k"
+
+decode_backends:
+  - CUTLASS_MLA
+
+prefill_backends:
+  - fa2
+  - fa3
+  - fa4
+  - flashinfer
+  - cudnn
+  - trtllm

 device: "cuda:0"
-repeats: 10
-warmup_iters: 3
-profile_memory: true
+repeats: 20
+warmup_iters: 5
@@ -0,0 +1,58 @@
+# MLA decode-only benchmark configuration
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128  # Base value, can be swept for TP simulation
+  num_kv_heads: 1  # MLA uses single latent KV
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Small batches, varying sequence lengths
+  - "16q1s512"  # 16 requests, 512 KV cache
+  - "16q1s1k"  # 16 requests, 1k KV cache
+  - "16q1s2k"  # 16 requests, 2k KV cache
+  - "16q1s4k"  # 16 requests, 4k KV cache
+
+  # Medium batches
+  - "32q1s1k"  # 32 requests, 1k KV cache
+  - "32q1s2k"  # 32 requests, 2k KV cache
+  - "32q1s4k"  # 32 requests, 4k KV cache
+  - "32q1s8k"  # 32 requests, 8k KV cache
+
+  # Large batches
+  - "64q1s1k"  # 64 requests, 1k KV cache
+  - "64q1s2k"  # 64 requests, 2k KV cache
+  - "64q1s4k"  # 64 requests, 4k KV cache
+  - "64q1s8k"  # 64 requests, 8k KV cache
+
+  # Very large batches
+  - "128q1s1k"  # 128 requests, 1k KV cache
+  - "128q1s2k"  # 128 requests, 2k KV cache
+  - "128q1s4k"  # 128 requests, 4k KV cache
+  - "128q1s8k"  # 128 requests, 8k KV cache
+
+  # Long context
+  - "32q1s16k"  # 32 requests, 16k KV cache
+  - "32q1s32k"  # 32 requests, 32k KV cache
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 100
+warmup_iters: 10
+profile_memory: true
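The label_format key above is an ordinary str.format template; a sketch of the expansion it implies (the sweep runner itself is assumed, only the formatting is shown):

label_format = "{backend}_{value}h"
for backend in ("FLASHMLA_SPARSE", "FLASHINFER_MLA_SPARSE"):
    for value in (128, 64, 32, 16):  # TP=1/2/4/8 head counts
        print(label_format.format(backend=backend, value=value))
# -> FLASHMLA_SPARSE_128h, FLASHMLA_SPARSE_64h, ...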
@@ -0,0 +1,62 @@
+# MLA prefill-only benchmark configuration for sparse backends
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Pure prefill
+  - "1q512"
+  - "1q1k"
+  - "1q2k"
+  - "1q4k"
+  - "1q8k"
+
+  # Batched pure prefill
+  - "2q512"
+  - "2q1k"
+  - "2q2k"
+  - "2q4k"
+  - "2q8k"
+  - "4q512"
+  - "4q1k"
+  - "4q2k"
+  - "4q4k"
+  - "4q8k"
+  - "8q512"
+  - "8q1k"
+  - "8q2k"
+  - "8q4k"
+  - "8q8k"
+
+  # Extend
+  - "1q512s4k"
+  - "1q512s8k"
+  - "1q1ks8k"
+  - "1q2ks8k"
+  - "1q2ks16k"
+  - "1q4ks16k"
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 10
+warmup_iters: 3
+profile_memory: true
@@ -60,8 +60,11 @@ def create_minimal_vllm_config(
     model_name: str = "deepseek-v3",
     block_size: int = 128,
     max_num_seqs: int = 256,
+    max_num_batched_tokens: int = 8192,
     mla_dims: dict | None = None,
     index_topk: int | None = None,
+    prefill_backend: str | None = None,
+    kv_cache_dtype: str = "auto",
 ) -> VllmConfig:
     """
     Create minimal VllmConfig for MLA benchmarks.
@@ -75,6 +78,9 @@ def create_minimal_vllm_config(
             setup_mla_dims(model_name)
         index_topk: Optional topk value for sparse MLA backends. If provided,
             the config will include index_topk for sparse attention.
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
+            "cudnn", "trtllm"). Configures the attention config to
+            force the specified prefill backend.

     Returns:
         VllmConfig for benchmarking
@@ -145,14 +151,13 @@ def create_minimal_vllm_config(
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
-        swap_space=0,
-        cache_dtype="auto",
+        cache_dtype=kv_cache_dtype,
         enable_prefix_caching=False,
     )

     scheduler_config = SchedulerConfig(
         max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=8192,
+        max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
         max_model_len=32768,
         is_encoder_decoder=False,
         enable_chunked_prefill=True,
@@ -164,7 +169,7 @@ def create_minimal_vllm_config(

     compilation_config = CompilationConfig()

-    return VllmConfig(
+    vllm_config = VllmConfig(
         model_config=model_config,
         cache_config=cache_config,
         parallel_config=parallel_config,
@@ -172,9 +177,84 @@ def create_minimal_vllm_config(
         compilation_config=compilation_config,
     )
+
+    if prefill_backend is not None:
+        prefill_cfg = get_prefill_backend_config(prefill_backend)
+        if prefill_cfg["flash_attn_version"] is not None:
+            vllm_config.attention_config.flash_attn_version = prefill_cfg[
+                "flash_attn_version"
+            ]
+        vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
+            "disable_flashinfer_prefill"
+        ]
+        vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
+            "use_cudnn_prefill"
+        ]
+        vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
+            "use_trtllm_ragged_deepseek_prefill"
+        ]
+
+    return vllm_config
+
+
 # ============================================================================
-# Backend Configuration
+# Prefill Backend Configuration
+# ============================================================================
+
+# Maps prefill backend names to attention config overrides.
+# FA backends set flash_attn_version and disable non-FA paths.
+# Non-FA backends enable their specific path and disable others.
+_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
+    "fa2": {
+        "flash_attn_version": 2,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "fa3": {
+        "flash_attn_version": 3,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "fa4": {
+        "flash_attn_version": 4,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "flashinfer": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": False,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "cudnn": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": True,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "trtllm": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": True,
+    },
+}
+
+
+def get_prefill_backend_config(prefill_backend: str) -> dict:
+    """Get attention config overrides for a prefill backend."""
+    if prefill_backend not in _PREFILL_BACKEND_CONFIG:
+        raise ValueError(
+            f"Unknown prefill backend: {prefill_backend!r}. "
+            f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
+        )
+    return _PREFILL_BACKEND_CONFIG[prefill_backend]
+
+
+# ============================================================================
+# Decode Backend Configuration
 # ============================================================================
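Applying a prefill backend then reduces to a lookup in the table plus attribute assignments; a self-contained sketch with a stand-in attention_config (the real object hangs off VllmConfig):

from types import SimpleNamespace

# Stand-in table and config so the sketch runs on its own; the real ones
# are _PREFILL_BACKEND_CONFIG / vllm_config.attention_config above.
PREFILL_CFG = {
    "fa3": {
        "flash_attn_version": 3,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
}
attention_config = SimpleNamespace(
    flash_attn_version=None,
    disable_flashinfer_prefill=False,
    use_cudnn_prefill=False,
    use_trtllm_ragged_deepseek_prefill=False,
)

cfg = PREFILL_CFG["fa3"]
if cfg["flash_attn_version"] is not None:
    attention_config.flash_attn_version = cfg["flash_attn_version"]
for key in (
    "disable_flashinfer_prefill",
    "use_cudnn_prefill",
    "use_trtllm_ragged_deepseek_prefill",
):
    setattr(attention_config, key, cfg[key])

assert attention_config.flash_attn_version == 3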
@@ -204,6 +284,7 @@ def _get_backend_config(backend: str) -> dict:
     Returns:
         Dict with backend configuration
     """
+    from vllm.v1.attention.backend import MultipleOf
     from vllm.v1.attention.backends.registry import AttentionBackendEnum

     try:
@@ -220,8 +301,8 @@ def _get_backend_config(backend: str) -> dict:
         block_sizes = backend_class.get_supported_kernel_block_sizes()
         # Use first supported block size (backends typically support one for MLA)
         block_size = block_sizes[0] if block_sizes else None
-        if hasattr(block_size, "value"):
-            # Handle MultipleOf enum
+        if isinstance(block_size, MultipleOf):
+            # No fixed block size; fall back to config value
             block_size = None

         # Check if sparse via class method if available
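The new isinstance check distinguishes a fixed block size from a "multiple of N" marker; a sketch of the pattern with a stand-in MultipleOf (the real class lives in vllm.v1.attention.backend, and its exact shape is assumed here):

class MultipleOf:
    # Assumed stand-in: a backend with no fixed kernel block size
    # advertises "any multiple of base" instead of a plain int.
    def __init__(self, base: int):
        self.base = base

block_sizes = [MultipleOf(16)]          # e.g. what such a backend might return
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):  # no fixed size; defer to the config
    block_size = None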
@@ -456,6 +537,7 @@ def _create_backend_impl(
     device: torch.device,
     max_num_tokens: int = 8192,
     index_topk: int | None = None,
+    kv_cache_dtype: str = "auto",
 ):
     """
     Create backend implementation instance.
@@ -504,7 +586,7 @@ def _create_backend_impl(
         "num_kv_heads": mla_dims["num_kv_heads"],
         "alibi_slopes": None,
         "sliding_window": None,
-        "kv_cache_dtype": "auto",
+        "kv_cache_dtype": kv_cache_dtype,
         "logits_soft_cap": None,
         "attn_type": "decoder",
         "kv_sharing_target_layer_name": None,
@@ -622,6 +704,7 @@ def _run_single_benchmark(
     mla_dims: dict,
     device: torch.device,
     indexer=None,
+    kv_cache_dtype: str | None = None,
 ) -> BenchmarkResult:
     """
     Run a single benchmark iteration.
@@ -655,38 +738,97 @@ def _run_single_benchmark(
     )

     # Create KV cache
+    if kv_cache_dtype is None:
+        kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
+    head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
+    if kv_cache_dtype == "fp8_ds_mla":
+        # FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
+        # Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
+        #         + 2*rope_dim bf16 bytes
+        #         = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
-        mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
+            656,
+            device=device,
+            dtype=torch.uint8,
+        )
+    elif kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform
+
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.uint8,
+        ).view(current_platform.fp8_dtype())
+    else:
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.bfloat16,
+        )

-    # Create input tensors for both decode and prefill modes
-    decode_inputs, prefill_inputs = _create_input_tensors(
-        total_q,
-        mla_dims,
-        backend_cfg["query_format"],
-        device,
-        torch.bfloat16,
-    )
-
     # Fill indexer with random indices for sparse backends
     is_sparse = backend_cfg.get("is_sparse", False)
     if is_sparse and indexer is not None:
         indexer.fill_random_indices(total_q, max_kv_len)

-    # Determine which forward method to use
-    if is_sparse:
-        # Sparse backends use forward_mqa
-        forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
-    elif metadata.decode is not None:
-        forward_fn = lambda: impl._forward_decode(
-            decode_inputs, kv_cache, metadata, layer
-        )
-    elif metadata.prefill is not None:
-        forward_fn = lambda: impl._forward_prefill(
+    # Determine which forward methods to use based on metadata.
+    # Sparse MLA backends always use forward_mqa
+    has_decode = is_sparse or getattr(metadata, "decode", None) is not None
+    has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
+    if not has_decode and not has_prefill:
+        raise RuntimeError("Metadata has neither decode nor prefill metadata")
+
+    num_decode = (
+        metadata.num_decode_tokens
+        if (has_decode and has_prefill)
+        else total_q
+        if has_decode
+        else 0
+    )
+    num_prefill = total_q - num_decode
+
+    # Some backends require fp8 queries when using fp8 KV cache.
+    is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
+    quantize_query = is_fp8_kvcache and getattr(
+        impl, "supports_quant_query_input", False
+    )
+
+    # quantize_query forces concat format
+    query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
+
+    # Create decode query tensors
+    if has_decode:
+        decode_inputs, _ = _create_input_tensors(
+            num_decode, mla_dims, query_fmt, device, torch.bfloat16
+        )
+        # Cast decode query to fp8 if the backend supports it
+        if quantize_query:
+            from vllm.platforms import current_platform
+
+            if isinstance(decode_inputs, tuple):
+                decode_inputs = torch.cat(list(decode_inputs), dim=-1)
+            decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
+
+    # Create prefill input tensors
+    if has_prefill:
+        _, prefill_inputs = _create_input_tensors(
+            num_prefill, mla_dims, query_fmt, device, torch.bfloat16
+        )
+
+    # Build forward function
+    def forward_fn():
+        results = []
+        if has_decode:
+            results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
+        if has_prefill:
+            results.append(
+                impl.forward_mha(
                     prefill_inputs["q"],
                     prefill_inputs["k_c_normed"],
                     prefill_inputs["k_pe"],
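The 656-byte figure in the new comment checks out against the DeepSeek dims used throughout this PR; the arithmetic in isolation:

# Per-token entry size for the fp8_ds_mla cache layout described above.
kv_lora_rank = 512      # fp8  -> 1 byte each
num_tile_scales = 4     # fp32 -> 4 bytes each
qk_rope_head_dim = 64   # bf16 -> 2 bytes each

entry_bytes = kv_lora_rank * 1 + num_tile_scales * 4 + qk_rope_head_dim * 2
assert entry_bytes == 512 + 16 + 128 == 656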
@@ -695,13 +837,24 @@ def _run_single_benchmark(
                     prefill_inputs["k_scale"],
                     prefill_inputs["output"],
                 )
-    else:
-        raise RuntimeError("Metadata has neither decode nor prefill metadata")
+            )
+        return results[0] if len(results) == 1 else tuple(results)

     # Warmup
     for _ in range(config.warmup_iters):
         forward_fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
+
+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            forward_fn()
+        benchmark_fn = graph.replay
+    else:
+        benchmark_fn = forward_fn

     # Benchmark
     times = []
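The capture/replay pattern added here is stock PyTorch; a minimal self-contained sketch (requires a CUDA device; step() is a stand-in for the benchmark's forward_fn):

import torch

def step(x: torch.Tensor) -> torch.Tensor:
    return x * 2 + 1  # stand-in workload

if torch.cuda.is_available():
    x = torch.ones(1024, device="cuda")
    step(x)                        # warmup so kernels/allocations are settled
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):  # capture: kernels are recorded, not run
        y = step(x)
    graph.replay()                 # replay with near-zero CPU launch overhead
    torch.cuda.synchronize()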
@@ -711,10 +864,10 @@ def _run_single_benchmark(

         start.record()
         for _ in range(config.num_layers):
-            forward_fn()
+            benchmark_fn()
         end.record()

-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         elapsed_ms = start.elapsed_time(end)
         times.append(elapsed_ms / 1000.0 / config.num_layers)

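One event pair brackets config.num_layers calls, so per-call seconds fall out by division; the measurement pattern in isolation:

import torch

def time_per_call(fn, num_calls: int) -> float:
    """Mean seconds per call via CUDA events, as in the loop above."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_calls):
        fn()
    end.record()
    torch.cuda.synchronize()               # events are async; wait before reading
    return start.elapsed_time(end) / 1000.0 / num_calls  # ms -> s, averaged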
@@ -733,6 +886,7 @@ def _run_mla_benchmark_batched(
     backend: str,
     configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
     index_topk: int = 2048,
+    prefill_backend: str | None = None,
 ) -> list[BenchmarkResult]:
     """
     Unified batched MLA benchmark runner for all backends.
@@ -744,11 +898,13 @@ def _run_mla_benchmark_batched(
     to avoid setup/teardown overhead.

     Args:
-        backend: Backend name
+        backend: Backend name (decode backend used for impl construction)
         configs_with_params: List of (config, threshold, num_splits) tuples
            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
            - num_splits: num_kv_splits (CUTLASS only)
         index_topk: Topk value for sparse MLA backends (default 2048)
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
+            When set, forces the specified FlashAttention version for prefill.

     Returns:
         List of BenchmarkResult objects
@@ -758,7 +914,7 @@ def _run_mla_benchmark_batched(

     backend_cfg = _get_backend_config(backend)
     device = torch.device(configs_with_params[0][0].device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)

     # Determine block size
     config_block_size = configs_with_params[0][0].block_size
@@ -775,24 +931,89 @@ def _run_mla_benchmark_batched(
     # Determine if this is a sparse backend
     is_sparse = backend_cfg.get("is_sparse", False)

+    # Extract kv_cache_dtype from the first config
+    kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
+
+    # FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
+    # Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
+    if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
+        kv_cache_dtype = "fp8_ds_mla"
+
+    # Compute max total_q across all configs so the metadata builder buffer
+    # and scheduler config are large enough for all batch specs.
+    max_total_q = max(
+        sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
+        for cfg, *_ in configs_with_params
+    )
+
     # Create and set vLLM config for MLA (reused across all benchmarks)
     vllm_config = create_minimal_vllm_config(
         model_name="deepseek-v3",  # Used only for model path
         block_size=block_size,
+        max_num_batched_tokens=max_total_q,
         mla_dims=mla_dims,  # Use custom dims from config or default
         index_topk=index_topk if is_sparse else None,
+        prefill_backend=prefill_backend,
+        kv_cache_dtype=kv_cache_dtype,
     )

     results = []

     with set_current_vllm_config(vllm_config):
+        # Clear cached prefill backend detection functions so they re-evaluate
+        # with the current VllmConfig. These are @functools.cache decorated and
+        # would otherwise return stale results from a previous backend's config.
+        from vllm.model_executor.layers.attention.mla_attention import (
+            use_cudnn_prefill,
+            use_flashinfer_prefill,
+            use_trtllm_ragged_deepseek_prefill,
+        )
+
+        use_flashinfer_prefill.cache_clear()
+        use_cudnn_prefill.cache_clear()
+        use_trtllm_ragged_deepseek_prefill.cache_clear()
+
         # Create backend impl, layer, builder, and indexer (reused across benchmarks)
         impl, layer, builder_instance, indexer = _create_backend_impl(
             backend_cfg,
             mla_dims,
             vllm_config,
             device,
+            max_num_tokens=max_total_q,
             index_topk=index_topk if is_sparse else None,
+            kv_cache_dtype=kv_cache_dtype,
+        )
+
+        # Verify the actual prefill backend matches what was requested
+        if prefill_backend is not None:
+            prefill_cfg = get_prefill_backend_config(prefill_backend)
+            fa_version = prefill_cfg["flash_attn_version"]
+
+            if fa_version is not None:
+                # FA backend: verify the impl's FA version
+                actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
+                if actual_fa_version != fa_version:
+                    raise RuntimeError(
+                        f"Prefill backend '{prefill_backend}' requested FA "
+                        f"version {fa_version}, but the impl is using FA "
+                        f"version {actual_fa_version}. Check "
+                        f"vllm/v1/attention/backends/fa_utils.py."
+                    )
+            else:
+                # Non-FA backend: verify the builder picked the right path
+                expected_flags = {
+                    "flashinfer": "_use_fi_prefill",
+                    "cudnn": "_use_cudnn_prefill",
+                    "trtllm": "_use_trtllm_ragged_prefill",
+                }
+                flag_name = expected_flags.get(prefill_backend)
+                if flag_name and not getattr(builder_instance, flag_name, False):
+                    raise RuntimeError(
+                        f"Prefill backend '{prefill_backend}' was requested "
+                        f"but the metadata builder did not enable it. This "
+                        f"usually means a dependency is missing (e.g., "
+                        f"flashinfer not installed) or the platform doesn't "
+                        f"support it."
+                    )

         # Run each benchmark with the shared impl
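Why the cache_clear() calls are needed: functools.cache memoizes on arguments only, so a zero-arg detection function never re-reads ambient config once called. A generic demonstration of the staleness the comment describes (not vLLM's actual functions):

import functools

flag = {"use_flashinfer": True}

@functools.cache
def use_flashinfer_prefill() -> bool:
    return flag["use_flashinfer"]   # reads ambient state, then cached forever

assert use_flashinfer_prefill() is True
flag["use_flashinfer"] = False
assert use_flashinfer_prefill() is True    # stale: cached result survives
use_flashinfer_prefill.cache_clear()
assert use_flashinfer_prefill() is False   # re-evaluated after clearing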
@@ -819,6 +1040,7 @@ def _run_mla_benchmark_batched(
                 mla_dims,
                 device,
                 indexer=indexer,
+                kv_cache_dtype=kv_cache_dtype,
             )
             results.append(result)

@@ -845,6 +1067,7 @@ def run_mla_benchmark(
     reorder_batch_threshold: int | None = None,
     num_kv_splits: int | None = None,
     index_topk: int = 2048,
+    prefill_backend: str | None = None,
 ) -> BenchmarkResult | list[BenchmarkResult]:
     """
     Unified MLA benchmark runner for all backends.
@@ -862,6 +1085,8 @@ def run_mla_benchmark(
             (single config mode only)
         num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
         index_topk: Topk value for sparse MLA backends (default 2048)
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
+            When set, forces the specified FlashAttention version for prefill.

     Returns:
         BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -885,7 +1110,9 @@ def run_mla_benchmark(
         return_single = True

     # Use unified batched execution
-    results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
+    results = _run_mla_benchmark_batched(
+        backend, configs_with_params, index_topk, prefill_backend=prefill_backend
+    )

     # Return single result or list based on input
     return results[0] if return_single else results
@@ -140,8 +140,7 @@ def _create_vllm_config(

     cache_config = CacheConfig(
         block_size=config.block_size,
-        cache_dtype="auto",
-        swap_space=0,
+        cache_dtype=config.kv_cache_dtype,
     )
     cache_config.num_gpu_blocks = max_num_blocks
     cache_config.num_cpu_blocks = 0
@@ -216,7 +215,7 @@ def _create_backend_impl(
         num_kv_heads=config.num_kv_heads,
         alibi_slopes=None,
         sliding_window=None,
-        kv_cache_dtype="auto",
+        kv_cache_dtype=config.kv_cache_dtype,
     )

     kv_cache_spec = FullAttentionSpec(
@@ -289,12 +288,22 @@ def _create_input_tensors(
     total_q: int,
     device: torch.device,
     dtype: torch.dtype,
+    quantize_query: bool = False,
 ) -> tuple:
-    """Create Q, K, V input tensors for all layers."""
+    """Create Q, K, V input tensors for all layers.
+
+    When quantize_query is True, queries are cast to fp8 to match backends
+    that require query/key/value dtype consistency.
+    """
+    q_dtype = dtype
+    if quantize_query:
+        from vllm.platforms import current_platform
+
+        q_dtype = current_platform.fp8_dtype()
     q_list = [
         torch.randn(
             total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
-        )
+        ).to(q_dtype)
         for _ in range(config.num_layers)
     ]
     k_list = [
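torch.randn cannot emit fp8 directly, hence the randn-then-cast above; a tiny sketch using torch.float8_e4m3fn as a stand-in for current_platform.fp8_dtype(), which is platform-dependent:

import torch

# bf16 queries cast to fp8 so their dtype matches an fp8 KV cache.
q = torch.randn(16, 8, 64, dtype=torch.bfloat16)   # randn has no fp8 support
q_fp8 = q.to(torch.float8_e4m3fn)                  # assumed stand-in fp8 dtype
assert q_fp8.dtype == torch.float8_e4m3fn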
@@ -345,10 +354,17 @@ def _create_kv_cache(
     # Compute inverse permutation to get back to logical view
     inv_order = [stride_order.index(i) for i in range(len(stride_order))]

+    # Use fp8 dtype for cache when requested.
+    cache_dtype = dtype
+    if config.kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform
+
+        cache_dtype = current_platform.fp8_dtype()
+
     cache_list = []
     for _ in range(config.num_layers):
         # Allocate in physical layout order (contiguous in memory)
-        cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
+        cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
         # Permute to logical view
         cache = cache.permute(*inv_order)
         cache_list.append(cache)
@@ -391,15 +407,14 @@ def _run_single_benchmark(
             attn_metadata,
             output=out,
         )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

-    # Benchmark
-    times = []
-    for _ in range(config.repeats):
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-
-        start.record()
+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
             for i in range(config.num_layers):
                 impl.forward(
                     layer,
@@ -410,17 +425,40 @@ def _run_single_benchmark(
                     attn_metadata,
                     output=out,
                 )
+        benchmark_fn = graph.replay
+    else:
+
+        def benchmark_fn():
+            for i in range(config.num_layers):
+                impl.forward(
+                    layer,
+                    q_list[i],
+                    k_list[i],
+                    v_list[i],
+                    cache_list[i],
+                    attn_metadata,
+                    output=out,
+                )
+
+    # Benchmark
+    times = []
+    for _ in range(config.repeats):
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+
+        start.record()
+        benchmark_fn()
         end.record()

-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         elapsed_ms = start.elapsed_time(end)
         times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer

     mem_stats = {}
     if config.profile_memory:
         mem_stats = {
-            "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
-            "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+            "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
+            "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
         }

     return times, mem_stats
@@ -444,7 +482,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
         BenchmarkResult with timing and memory statistics
     """
     device = torch.device(config.device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)

     backend_cfg = _get_backend_config(config.backend)

@@ -503,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
         common_attn_metadata=common_metadata,
     )

+    # Only quantize queries when the impl supports it
+    quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
+        impl, "supports_quant_query_input", False
+    )
     q_list, k_list, v_list = _create_input_tensors(
-        config, total_q, device, dtype
+        config, total_q, device, dtype, quantize_query=quantize_query
    )

     cache_list = _create_kv_cache(
@@ -85,7 +85,6 @@ start_server() {
     # Each argument and its value are separate elements.
     local common_args_array=(
         "$MODEL"
-        "--disable-log-requests"
         "--port" "8004"
         "--host" "$HOSTNAME"
         "--gpu-memory-utilization" "$gpu_memory_utilization"
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_openai_completions,
     "llama.cpp": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k
-    for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions, async_request_openai_chat_completions)
-]
@@ -94,15 +94,18 @@ def create_logits(

 def measure_memory() -> tuple[int, int]:
     """Return (allocated, reserved) memory in bytes."""
-    torch.cuda.synchronize()
-    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
+    torch.accelerator.synchronize()
+    return (
+        torch.accelerator.memory_allocated(),
+        torch.accelerator.max_memory_allocated(),
+    )


 def reset_memory_stats():
     """Reset peak memory statistics."""
     reset_buffer_cache()
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.empty_cache()
+    torch.accelerator.reset_peak_memory_stats()
+    torch.accelerator.empty_cache()
     gc.collect()

@@ -123,7 +126,7 @@ def benchmark_function(
     for _ in range(warmup_iters):
         logits_copy = logits.clone()
         func(logits_copy, k, p)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

     # Reset memory stats before benchmark
     reset_memory_stats()
@@ -140,7 +143,7 @@ def benchmark_function(
             func(logits_copy, k, p)
         end_events[i].record()

-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

     # Calculate timing
     times = [
@@ -1,78 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import math
-import os
 import time
 from types import TracebackType
-from typing import Any
-
-
-def convert_to_pytorch_benchmark_format(
-    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
-) -> list:
-    """
-    Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    """
-    records = []
-    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        return records
-
-    for name, benchmark_values in metrics.items():
-        record = {
-            "benchmark": {
-                "name": "vLLM benchmark",
-                "extra_info": {
-                    "args": vars(args),
-                },
-            },
-            "model": {
-                "name": args.model,
-            },
-            "metric": {
-                "name": name,
-                "benchmark_values": benchmark_values,
-                "extra_info": extra_info,
-            },
-        }
-
-        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
-        # Save tensor_parallel_size parameter if it's part of the metadata
-        if not tp and "tensor_parallel_size" in extra_info:
-            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
-                extra_info["tensor_parallel_size"]
-            )
-
-        records.append(record)
-
-    return records
-
-
-class InfEncoder(json.JSONEncoder):
-    def clear_inf(self, o: Any):
-        if isinstance(o, dict):
-            return {k: self.clear_inf(v) for k, v in o.items()}
-        elif isinstance(o, list):
-            return [self.clear_inf(v) for v in o]
-        elif isinstance(o, float) and math.isinf(o):
-            return "inf"
-        return o
-
-    def iterencode(self, o: Any, *args, **kwargs) -> Any:
-        return super().iterencode(self.clear_inf(o), *args, **kwargs)
-
-
-def write_to_json(filename: str, records: list) -> None:
-    with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )
-
-
 # Collect time and generate time metrics
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Cutlass bench utils
-from collections.abc import Iterable

 import torch

@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(

     # Compressed B, Metadata, Original A, B
     return b_compressed, e, a, b
-
-
-def make_n_rand_sparse_tensors(
-    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
-) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
-        if b_comp is not None:
-            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
-    BComps, Es, As, Bs = zip(*ABs)
-    return list(BComps), list(Es), list(As), list(Bs)
@@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """Token bucket rate limiter implementation"""
-
-    def __init__(self, rate_limit):
-        self.rate_limit = rate_limit  # Requests per second
-        self.num_available_tokens = rate_limit  # Available tokens
-        self.last_refill = time.monotonic()  # Last token refill time
-        self.lock = asyncio.Lock()  # Synchronization lock
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter"""
-        while True:
-            async with self.lock:
-                current_time = time.monotonic()
-                elapsed = current_time - self.last_refill
-
-                # Refill num_available_tokens if more than 1 second has passed
-                if elapsed > 1.0:
-                    self.num_available_tokens = self.rate_limit
-                    self.last_refill = current_time
-
-                # Check if num_available_tokens are available
-                if self.num_available_tokens > 0:
-                    self.num_available_tokens -= 1
-                    return True
-
-            # Calculate wait time if no num_available_tokens available
-            wait_time = 1.0 - elapsed
-            await asyncio.sleep(wait_time)
-
-    async def __aenter__(self):
-        """Enter async context manager - acquire token"""
-        await self.acquire()
-        return self
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        """Exit async context manager - no cleanup needed"""
-        pass
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-from collections import deque
-
-
-class RequestQueue:
-    """Request queue manager with concurrency control"""
-
-    def __init__(self, max_concurrent, max_queue_size):
-        # Maximum concurrent requests
-        self.max_concurrent = max_concurrent
-        self.max_queue_size = max_queue_size  # Maximum queue size
-        # Concurrency control
-        self.semaphore = asyncio.Semaphore(max_concurrent)
-        self.queue = deque()  # Request queue
-        self.queue_size = 0  # Current queue size
-        self.lock = asyncio.Lock()  # Sync queue Lock
-
-    async def enqueue(self, task):
-        """Add a request task to the queue"""
-        async with self.lock:
-            if self.queue_size >= self.max_queue_size:
-                return False
-
-            self.queue.append(task)
-            self.queue_size += 1
-            return True
-
-    async def process(self):
-        """Process queued requests using semaphore for concurrency control"""
-        while True:
-            if self.queue:
-                async with self.semaphore, self.lock:
-                    task = self.queue.popleft()
-                    self.queue_size -= 1
-                    await task
-            await asyncio.sleep(0.01)  # Yield control to event loop
benchmarks/kernels/bench_concat_mla_q.py (new file, +98)
@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

import torch

from vllm import _custom_ops as ops
from vllm.triton_utils import triton

# DeepSeek V3 dimensions
NOPE_DIM = 512
ROPE_DIM = 64
NUM_HEADS = 128

NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]


def get_configs():
    return NUM_TOKENS


def make_inputs(num_tokens, dtype):
    """Create inputs matching the real code path.

    Args:
        contiguous_nope: If False, simulate the transposed BMM output
            (non-contiguous nope with stride pattern from
            [N,B,L].transpose(0,1)).
    """
    # Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
    raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
    ql_nope = raw.transpose(0, 1)

    q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
    return ql_nope, q_pe


# ---- Non-contiguous nope benchmark (real code path) ----
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens"],
        x_vals=get_configs(),
        line_arg="provider",
        line_vals=["torch_cat", "concat_mla_q"],
        line_names=["torch.cat", "concat_mla_q (v8)"],
        styles=[("blue", "--"), ("green", "-")],
        ylabel="Latency (us)",
        plot_name="concat_mla_q-transposed",
        args={},
    )
)
def bench_transposed(num_tokens, provider):
    dtype = torch.bfloat16
    ql_nope, q_pe = make_inputs(num_tokens, dtype)

    q_out = torch.empty(
        num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
    )

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch_cat":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
        )
    else:
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
        )

    return ms * 1000, max_ms * 1000, min_ms * 1000  # us


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
    parser.add_argument(
        "--save-path", type=str, default=None, help="Path to save benchmark results"
    )
    args = parser.parse_args()

    print("\n" + "=" * 70)
    print("CONCAT MLA Q KERNEL BENCHMARKS")
    print("=" * 70)
    print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
    print(
        f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
        f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
    )
    print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
    print("=" * 70)

    print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
    bench_transposed.run(print_data=True, save_path=args.save_path)

    print("\n" + "=" * 70)
    print("Benchmarking complete!")
    print("=" * 70)
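As a quick sanity check (not part of the benchmark file above), the fused kernel and the torch.cat baseline should agree on the same inputs. A hedged sketch reusing the file's own helpers, assuming ops.concat_mla_q writes its result into q_out as the benchmark implies:

import torch
from vllm import _custom_ops as ops

# Assumes make_inputs, NUM_HEADS, NOPE_DIM, ROPE_DIM from the file above.
ql_nope, q_pe = make_inputs(64, torch.bfloat16)
q_out = torch.empty(
    64, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=torch.bfloat16, device="cuda"
)
ops.concat_mla_q(ql_nope, q_pe, q_out)          # fused concat into q_out
ref = torch.cat((ql_nope, q_pe), dim=-1)        # eager baseline
torch.testing.assert_close(q_out, ref)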
benchmarks/kernels/bench_cp_gather_fp8.py (new file, 153 lines)
@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import math

import torch

from vllm import _custom_ops as ops
from vllm.triton_utils import triton

# DeepSeek V3 MLA dimensions
NOPE_DIM = 512
ROPE_DIM = 64
HEAD_DIM = NOPE_DIM + ROPE_DIM  # 576 BF16 output elements per token
ENTRY_BYTES = 656  # 512 FP8 + 16 scales + 128 BF16 RoPE
BLOCK_SIZE = 64  # tokens per physical cache block - get_supported_kernel_block_sizes

# Realistic prefill scenarios:
# - 1 long prefill: single request, 16K-96K tokens
# - 4 medium prefills: 4 requests, 4K-24K tokens each
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
SCENARIOS = [
    # (label, num_reqs, total_tokens_list)
    ("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
    ("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
    ("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
]


def make_inputs(total_tokens, num_reqs, block_size):
    """Create synthetic FP8 cache, block table, and output buffer.

    Fills the cache with random bytes (we only measure throughput,
    not correctness). Block table maps each request to contiguous
    physical blocks.
    """
    # Divide tokens evenly across requests
    base_len = total_tokens // num_reqs
    remainder = total_tokens % num_reqs
    seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]

    # workspace_starts: cumulative sum of seq_lens
    workspace_starts = [0] * num_reqs
    for r in range(1, num_reqs):
        workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]

    # Physical blocks needed per request
    blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
    total_blocks = sum(blocks_per_req)
    max_blocks = max(blocks_per_req)

    # Allocate cache with random data (content doesn't matter for perf)
    cache = torch.randint(
        0,
        256,
        (total_blocks, block_size, ENTRY_BYTES),
        dtype=torch.uint8,
        device="cuda",
    )

    # Block table: contiguous block assignments
    block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
    block_idx = 0
    for r in range(num_reqs):
        for b in range(blocks_per_req[r]):
            block_table[r, b] = block_idx
            block_idx += 1

    # Output workspace
    dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")

    seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
    workspace_starts_t = torch.tensor(
        workspace_starts, dtype=torch.int32, device="cuda"
    )

    return cache, dst, block_table, seq_lens_t, workspace_starts_t


def bench_scenario(label, num_reqs, total_tokens_list, save_path):
    """Run benchmark for a specific (num_reqs, total_tokens) scenario."""

    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["total_tokens"],
            x_vals=total_tokens_list,
            line_arg="provider",
            line_vals=["cuda_kernel"],
            line_names=["cp_gather_fp8 (CUDA)"],
            styles=[("green", "-")],
            ylabel="Latency (us)",
            plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
            args={"num_reqs": num_reqs},
        )
    )
    def bench_fn(total_tokens, provider, num_reqs):
        cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
            total_tokens, num_reqs, BLOCK_SIZE
        )

        quantiles = [0.5, 0.2, 0.8]

        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
                cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
            ),
            quantiles=quantiles,
            rep=500,
        )

        return ms * 1000, max_ms * 1000, min_ms * 1000  # us

    seq_len_per_req = total_tokens_list[0] // num_reqs
    seq_len_per_req_max = total_tokens_list[-1] // num_reqs
    print(
        f"\n--- {label}: {num_reqs} request(s), "
        f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
    )
    bench_fn.run(print_data=True, save_path=save_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save benchmark results as CSV",
    )
    args = parser.parse_args()

    # Print data volume info for bandwidth analysis
    read_per_token = ENTRY_BYTES  # 656 bytes from cache
    write_per_token = HEAD_DIM * 2  # 576 * 2 = 1152 bytes to workspace
    total_per_token = read_per_token + write_per_token  # 1808 bytes

    print("\n" + "=" * 70)
    print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
    print("=" * 70)
    print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
    print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
    print(f"Per token: {total_per_token} bytes (read + write)")
    print(f"Block size: {BLOCK_SIZE} tokens/block")
    print("=" * 70)

    for label, num_reqs, total_tokens_list in SCENARIOS:
        bench_scenario(label, num_reqs, total_tokens_list, args.save_path)

    print("\n" + "=" * 70)
    print("Benchmarking complete!")
    print("=" * 70)
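Since the script prints per-token traffic (656 B read + 1152 B written = 1808 B), a measured latency converts directly into effective bandwidth. A small illustrative helper, not part of the file above:

def effective_bandwidth_gbps(latency_us: float, total_tokens: int) -> float:
    """Effective memory bandwidth implied by one benchmark data point."""
    bytes_per_token = 656 + 576 * 2  # read + write, from the constants above
    return total_tokens * bytes_per_token / (latency_us * 1e-6) / 1e9

# e.g. 65536 tokens gathered in 150 us -> ~790 GB/s
print(effective_bandwidth_gbps(150.0, 65536))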
@@ -168,7 +168,7 @@ def bench_impl(
     # warmup
     for kwargs in kwargs_list:
         impl_type.get_impl()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Merge into a single kwargs and qualify arguments as ArgPool
     kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
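The recurring change in the hunks that follow swaps CUDA-specific calls for the device-agnostic torch.accelerator namespace (PyTorch >= 2.6). A minimal sketch of the correspondence, illustrative and assuming an accelerator is present:

import torch

if torch.accelerator.is_available():
    dev = torch.accelerator.current_accelerator()   # e.g. device(type='cuda')
    idx = torch.accelerator.current_device_index()  # was torch.cuda.current_device()
    torch.accelerator.set_device_index(idx)         # was torch.cuda.set_device(idx)
    torch.accelerator.synchronize()                 # was torch.cuda.synchronize()
    print(dev, torch.accelerator.device_count())    # was torch.cuda.device_count()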
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
     # reference output
     ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
 
-    # test ouptut
+    # test output
     out_q, out_s = output_from_impl(
         ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
     )
@@ -12,12 +12,12 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager
@@ -64,7 +64,7 @@ def bench_run(
     per_out_ch: bool,
     mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
     (m, k, n) = mkn
 
     dtype = torch.half
@@ -137,15 +137,21 @@ def bench_run(
         per_out_ch_quant=per_out_ch,
     )
 
-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=num_experts,
-                hidden_dim=k,
-                intermediate_size_per_partition=n,
-                in_dtype=a.dtype,
-            ),
+    moe_config = make_dummy_moe_config(
+        num_experts=num_experts,
+        hidden_dim=k,
+        intermediate_size_per_partition=n,
+        in_dtype=a.dtype,
+    )
+    fn = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
+        CutlassExpertsFp8(
+            moe_config=moe_config,
             quant_config=quant_config,
         ),
     )
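The same migration pattern repeats across the MoE benchmark hunks below: build a shared moe_config once, then construct mk.FusedMoEKernel with maybe_make_prepare_finalize instead of mk.FusedMoEModularKernel with MoEPrepareAndFinalizeNoEP. Distilled from the hunk above (vLLM-internal APIs, reproduced here only to summarize the diff, with the surrounding variables assumed in scope):

moe_config = make_dummy_moe_config(
    num_experts=num_experts,
    hidden_dim=k,
    intermediate_size_per_partition=n,
    in_dtype=a.dtype,
)
fn = mk.FusedMoEKernel(
    # Builds the prepare/finalize stage from the config instead of
    # hard-coding the no-expert-parallel variant.
    maybe_make_prepare_finalize(
        moe=moe_config,
        quant_config=quant_config,
        allow_new_interface=True,
        use_monolithic=False,
    ),
    CutlassExpertsFp8(moe_config=moe_config, quant_config=quant_config),
)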
@@ -165,7 +171,7 @@ def bench_run(
             activation=MoEActivation.SILU,
             global_num_experts=num_experts,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
     triton_stream = torch.cuda.Stream()
@@ -181,14 +187,14 @@ def bench_run(
             topk_ids,
             quant_config=quant_config,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
         """Benchmark CUDA graph using events like benchmark_moe.py"""
         # Warmup
         for _ in range(num_warmup):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         # Timing
         start_event = torch.Event(enable_timing=True)
@@ -196,7 +202,7 @@ def bench_run(
 
         latencies = []
         for _ in range(num_iters):
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             start_event.record()
             graph.replay()
             end_event.record()
@@ -15,6 +15,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
     fp8_w8a8_moe_quant_config,
     nvfp4_moe_quant_config,
@@ -23,9 +26,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
     CutlassExpertsFp4,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.scalar_type import scalar_types
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager
@@ -196,10 +196,21 @@ def bench_run(
         g2_alphas=w2_gs,
     )
 
-    kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+    moe_config = make_dummy_moe_config(
+        num_experts=num_experts,
+        hidden_dim=k,
+        intermediate_size_per_partition=n,
+        in_dtype=a.dtype,
+    )
+    kernel = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            moe_config=moe_config,
             quant_config=quant_config,
         ),
     )
@@ -240,11 +251,17 @@ def bench_run(
         g1_alphas=w1_gs,
         g2_alphas=w2_gs,
     )
+    moe_config = make_dummy_moe_config()
 
-    kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+    kernel = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            moe_config=moe_config,
             quant_config=quant_config,
         ),
     )
@@ -290,7 +307,7 @@ def bench_run(
     def replay_graph(graph, num_repeats):
         for _ in range(num_repeats):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     cutlass_stream = torch.cuda.Stream()
     cutlass_graph = torch.cuda.CUDAGraph()
@@ -313,7 +330,7 @@ def bench_run(
             e=num_experts,
             device=device,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     triton_stream = torch.cuda.Stream()
     triton_graph = torch.cuda.CUDAGraph()
@@ -328,7 +345,7 @@ def bench_run(
             w2_fp8scale,
             a_fp8_scale,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     min_run_time = 5
     num_warmup = 5
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
         if not should_use_fn(tensor):
             return None
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         stream = torch.cuda.Stream()
         with torch.cuda.stream(stream):
             graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
         for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
             allreduce_fn(graph_input)
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         for _ in range(num_warmup):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.perf_counter()
 
         for _ in range(num_trials):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
 
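All of these timing hunks share one capture-and-replay skeleton: capture the work into a CUDA graph, warm up the replay, synchronize, then time a replay loop with a host clock. A self-contained sketch of that pattern (the name time_graph is illustrative, and fn must be CUDA-graph-capturable):

import time
import torch

def time_graph(fn, num_warmup=5, num_trials=100):
    fn()                                   # warmup / JIT compile outside the graph
    torch.accelerator.synchronize()
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        fn()                               # capture one invocation
    for _ in range(num_warmup):
        graph.replay()
    torch.accelerator.synchronize()
    start = time.perf_counter()
    for _ in range(num_trials):
        graph.replay()
    torch.accelerator.synchronize()
    return (time.perf_counter() - start) / num_trials  # seconds per replay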
@@ -495,7 +495,7 @@ def main():
 
     # Set device
     device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
 
     # Get CPU process group
     cpu_group = dist.new_group(backend="gloo")
@@ -385,32 +385,32 @@ def benchmark_operation(
     # Warmup before graph capture
     for _ in range(warmup):
         operation_func(*args, **kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Create CUDA graph
     graph = torch.cuda.CUDAGraph()
     num_op_per_cudagraph = 10
 
     # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
-    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
     with graph_capture(device=device), torch.cuda.graph(graph):
         for _ in range(num_op_per_cudagraph):
             operation_func(*args, **kwargs)
 
     # Graph warmup
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     for _ in range(warmup):
         graph.replay()
 
     # Benchmark with CUDA graph
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start_time = time.perf_counter()
 
     for _ in range(trials // num_op_per_cudagraph):
         # operation_func(*args, **kwargs)
         graph.replay()
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end_time = time.perf_counter()
 
     avg_time_ms = ((end_time - start_time) / trials) * 1000
@@ -984,7 +984,7 @@ def main():
     world_size = int(os.environ["WORLD_SIZE"])
 
     device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
     torch.set_default_device(device)
 
     init_distributed_environment()
@@ -9,15 +9,15 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
     fused_topk,
 )
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager
 
@@ -50,7 +50,7 @@ def bench_run(
     per_out_ch: bool,
     mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
     label = "Quant Matmul"
 
     sub_label = (
@@ -131,16 +131,22 @@ def bench_run(
         w2_scale=w2_scale,
         per_act_token_quant=per_act_token,
     )
 
-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+    moe_config = make_dummy_moe_config(
+        num_experts=w2.shape[0],
+        hidden_dim=w2.shape[1],
+        intermediate_size_per_partition=w2.shape[2],
+        in_dtype=a.dtype,
+    )
+    fn = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
+        CutlassExpertsFp8(
+            moe_config=moe_config,
             quant_config=quant_config,
         ),
     )
@@ -163,16 +169,22 @@ def bench_run(
         w2_scale=w2_scale,
         per_act_token_quant=per_act_token,
     )
 
-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
-        CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+    moe_config = make_dummy_moe_config(
+        num_experts=w2.shape[0],
+        hidden_dim=w2.shape[1],
+        intermediate_size_per_partition=w2.shape[2],
+        in_dtype=a.dtype,
+    )
+    fn = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
+        CutlassExpertsFp8(
+            moe_config=moe_config,
             quant_config=quant_config,
         ),
     )
@@ -212,7 +224,7 @@ def bench_run(
     def replay_graph(graph, num_repeats):
         for _ in range(num_repeats):
             graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     cutlass_stream = torch.cuda.Stream()
     cutlass_graph = torch.cuda.CUDAGraph()
@@ -227,7 +239,7 @@ def bench_run(
             topk_weights,
             topk_ids,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     triton_stream = torch.cuda.Stream()
     triton_graph = torch.cuda.CUDAGraph()
@@ -242,7 +254,7 @@ def bench_run(
             w2_scale,
             a_scale,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
     min_run_time = 5
     num_warmup = 5
@@ -34,14 +34,14 @@ def main(
     residual = torch.randn_like(x) * scale if add_residual else None
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
 
        for _ in range(num_iters):
            layer(x, residual)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
@@ -1035,7 +1035,7 @@ def bench_optype(
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Merge into a single kwargs and qualify arguments as ArgPool
     kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -47,13 +47,13 @@ def benchmark_method(
     # Warmup
     for _ in range(num_warmup):
         _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     start = time.perf_counter()
     for _ in range(num_iters):
         _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end = time.perf_counter()
 
     return (end - start) / num_iters * 1000  # Convert to ms
@@ -17,6 +17,9 @@ from ray.experimental.tqdm_ray import tqdm
 
 from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEParallelConfig,
@@ -51,7 +54,7 @@ def clear_triton_cache():
 
     # Clear CUDA memory cache
     if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()
 
     # Try to clear Triton's runtime cache
     try:
@@ -242,10 +245,8 @@ def benchmark_config(
 
     deep_gemm_experts = None
     if use_deep_gemm:
-        deep_gemm_experts = mk.FusedMoEModularKernel(
-            prepare_finalize=MoEPrepareAndFinalizeNoEP(),
-            fused_experts=TritonOrDeepGemmExperts(
-                moe_config=FusedMoEConfig(
+        moe_config = (
+            FusedMoEConfig(
                 num_experts=num_experts,
                 experts_per_token=topk,
                 hidden_dim=hidden_size,
@@ -258,8 +259,19 @@ def benchmark_config(
                 routing_method=RoutingMethodType.TopK,
                 device="cuda",
             ),
+        )
+        deep_gemm_experts = mk.FusedMoEKernel(
+            prepare_finalize=maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
+            fused_experts=TritonOrDeepGemmExperts(
+                moe_config=moe_config,
                 quant_config=quant_config,
             ),
+            inplace=not disable_inplace(),
         )
 
     with override_config(config):
@@ -269,8 +281,16 @@ def benchmark_config(
 
         inplace = not disable_inplace()
         if use_deep_gemm:
-            return deep_gemm_experts(
-                x, w1, w2, topk_weights, topk_ids, inplace=inplace
+            return deep_gemm_experts.apply(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                activation=MoEActivation.SILU,
+                global_num_experts=num_experts,
+                apply_router_weight_on_input=False,
+                expert_map=False,
             )
         return fused_experts(
             x,
@@ -284,19 +304,19 @@ def benchmark_config(
 
     # JIT compilation & warmup
     run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -304,7 +324,7 @@ def benchmark_config(
     latencies: list[float] = []
     for i in range(num_iters):
         prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         graph.replay()
@@ -606,7 +626,11 @@ class BenchmarkWorker:
         if visible_device != f"{self.device_id}":
             need_device_guard = True
 
-        with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+        with (
+            torch.accelerator.device_index(self.device_id)
+            if need_device_guard
+            else nullcontext()
+        ):
             for idx, config in enumerate(tqdm(search_space)):
                 try:
                     kernel_time = benchmark_config(
@@ -131,7 +131,7 @@ def benchmark_config(
         topk_ids,
         quant_config=quant_config,
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Benchmark
     start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
             quant_config=quant_config,
         )
     end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     return start.elapsed_time(end) / num_iters * 1000  # ms -> us
 
 
@@ -69,19 +69,19 @@ def benchmark_permute(
 
     # JIT compilation & warmup
     run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
     latencies: list[float] = []
     for i in range(num_iters):
         prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
     # JIT compilation & warmup
     input = prepare()
     run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Capture 10 invocations with CUDA graph
     graph = torch.cuda.CUDAGraph()
     with torch.cuda.graph(graph):
         for _ in range(10):
             run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Warmup
     for _ in range(5):
         graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_event.record()
         graph.replay()
         end_event.record()
@@ -135,14 +135,14 @@ def benchmark_mrope(
         key.clone(),
     )
 
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Time reference implementation
     torch_times = []
     for _ in range(benchmark_iter):
         query_clone = query.clone()
         key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.time()
 
         mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
             key_clone,
         )
 
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         torch_times.append(time.time() - start_time)
 
     # Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
     for _ in range(benchmark_iter):
         query_clone = query.clone()
         key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start_time = time.time()
         mrope_helper_class.forward_cuda(
             positions,
             query_clone,
             key_clone,
         )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         triton_times.append(time.time() - start_time)
 
     # Calculate statistics
@@ -103,7 +103,7 @@ def main(
     max_logits = torch.empty_like(exp_sums)
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
             )
         else:
             raise ValueError(f"Invalid version: {version}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
@@ -28,7 +28,7 @@ def _time_cuda(
     # warmup
     for _ in range(warmup_iters):
         fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start = torch.Event(enable_timing=True)
     end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
     for _ in range(bench_iters):
         fn()
     end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     return start.elapsed_time(end) / bench_iters  # ms/iter
 
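_time_cuda above uses the event-based variant of the same timing idea: device events bracket the loop and elapsed_time reads the GPU-side duration, avoiding host overhead in the measurement. A hedged standalone sketch, using the device-generic torch.Event the diff itself adopts:

import torch

def time_with_events(fn, warmup_iters=5, bench_iters=100):
    for _ in range(warmup_iters):
        fn()                               # warm up kernels / JIT
    torch.accelerator.synchronize()
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    start.record()
    for _ in range(bench_iters):
        fn()
    end.record()
    torch.accelerator.synchronize()        # make sure end has been reached
    return start.elapsed_time(end) / bench_iters  # ms per iteration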
@@ -29,7 +29,7 @@ def main(
     scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
 
     def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
             ops.scaled_int8_quant(x, scale)
         else:
             ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         end_time = time.perf_counter()
         if profile:
@@ -84,16 +84,16 @@ def run_benchmark(
         g = torch.cuda.CUDAGraph()
         with torch.cuda.graph(g):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         function_under_test = lambda: g.replay()
 
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 
@@ -104,7 +104,7 @@ def run_benchmark(
 
     # free tensors to mitigate OOM when sweeping
     del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
 
     return lat
 
@@ -109,16 +109,16 @@ def run_benchmark(
         g = torch.cuda.CUDAGraph()
         with torch.cuda.graph(g):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         function_under_test = lambda: g.replay()
 
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
             function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 
@@ -129,7 +129,7 @@ def run_benchmark(
 
     # free tensors to mitigate OOM when sweeping
     del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
 
     return lat
 
@@ -251,7 +251,7 @@ def benchmark(
     kernel(
         y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
     )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     start_event = torch.Event(enable_timing=True)
     end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
     # Benchmark
     latencies: list[float] = []
     for _ in range(runs):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 
         start_event.record()
         for i in range(iterations_per_run):
@@ -126,7 +126,7 @@ def benchmark_decode(
     )
 
     def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = torch.Event(enable_timing=True)
         end = torch.Event(enable_timing=True)
         times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
             start.record()
             fn()
             end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             times.append(start.elapsed_time(end))  # ms
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
@@ -138,7 +138,7 @@ def benchmark_prefill(
     )
 
     def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
         start = torch.Event(enable_timing=True)
         end = torch.Event(enable_timing=True)
         times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
             start.record()
             fn()
             end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
             times.append(start.elapsed_time(end))  # ms
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
|||||||
@@ -177,18 +177,18 @@ def benchmark_config(
|
|||||||
def run():
|
def run():
|
||||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
# JIT complication & warmup
|
# JIT complication & warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
run()
|
run()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
|
|||||||
weight_shapes = args_dict["weight_shapes"]
|
weight_shapes = args_dict["weight_shapes"]
|
||||||
args = args_dict["args"]
|
args = args_dict["args"]
|
||||||
|
|
||||||
torch.cuda.set_device(gpu_id)
|
torch.accelerator.set_device_index(gpu_id)
|
||||||
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
|
||||||
|
|
||||||
block_n = args.block_n
|
block_n = args.block_n
|
||||||
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
|
|||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
print(args)
|
print(args)
|
||||||
num_gpus = torch.cuda.device_count()
|
num_gpus = torch.accelerator.device_count()
|
||||||
if num_gpus == 0:
|
if num_gpus == 0:
|
||||||
raise RuntimeError("No GPU available for tuning")
|
raise RuntimeError("No GPU available for tuning")
|
||||||
print(f"Found {num_gpus} GPUs for parallel tuning")
|
print(f"Found {num_gpus} GPUs for parallel tuning")
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def benchmark_shape(
     B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
 
     # Reference result in BF16
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     C_ref = A @ B.t()
 
     # Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
     # Warmup
     for _ in range(warmup):
         func()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
 
     # Timing loop
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     start = time.time()
     for _ in range(repeat):
         func()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
     end = time.time()
 
     # Calculate timing and TFLOPS
Some files were not shown because too many files have changed in this diff.