[ROCm] Fix AttributeError for torch.compiler.skip_all_guards_unsafe on older PyTorch (#37219 )

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
[Refactor] Relocate completion and chat completion tests (#37125 )
2026-03-17 11:34:49 +08:00 · 2026-03-17 11:31:23 +08:00 · 2026-03-17 03:24:34 +00:00 · 2026-03-16 18:04:15 -07:00 · 2026-03-16 17:48:42 -07:00 · 2026-03-17 00:38:52 +00:00
1391 changed files with 116625 additions and 55608 deletions
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -10,7 +10,7 @@ steps:
      docker build
      --build-arg max_jobs=16
      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
      -f docker/Dockerfile.rocm
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -21,6 +21,20 @@ steps:
      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
      pytest -x -v -s tests/kernels/test_onednn.py"

+- label: CPU-Compatibility Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - cmake/cpu_extension.cmake
+  - setup.py
+  - vllm/platforms/cpu.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
+
 - label: CPU-Language Generation and Pooling Model Tests
  depends_on: []
  soft_fail: true
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -25,9 +25,7 @@ fi
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --build-arg VLLM_CPU_AVX512BF16=true \
-  --build-arg VLLM_CPU_AVX512VNNI=true \
-  --build-arg VLLM_CPU_AMXBF16=true \
+  --build-arg VLLM_CPU_X86=true \
  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
  --target vllm-test \
  --progress plain .
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,9 +13,10 @@ import os
 from contextlib import contextmanager

 import lm_eval
-import numpy as np
 import yaml

+from vllm.platforms import current_platform
+
 DEFAULT_RTOL = 0.08


@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
        "allow_deprecated_quantization=True,"
    )

+    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
+        model_args += "attention_backend=TRITON_ATTN"
+
    env_vars = eval_config.get("env_vars", None)
    with scoped_env_vars(env_vars):
        results = lm_eval.simple_evaluate(
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
                f"ground_truth={ground_truth:.3f} | "
                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+
+            min_acceptable = ground_truth * (1 - rtol)
+            success = success and measured_value >= min_acceptable

    assert success
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -7,12 +7,12 @@ import argparse
 import html as _html
 import json
 import os
+from contextlib import nullcontext
 from dataclasses import dataclass
 from importlib import util
 from pathlib import Path

 import pandas as pd
-import regex as re

 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
 pd.set_option("display.float_format", lambda x: f"{x:.2f}")


+# -----------------------------
+# Concurrency normalization (NEW, small)
+# -----------------------------
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+
+    for c in df.columns:
+        if "concurr" in str(c).lower():
+            s = df[c]
+            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
+                return c
+
+    raise ValueError(
+        "Cannot infer concurrency column. "
+        "Please rename the column to one of the known names "
+        "or add an explicit override (e.g., --concurrency-col)."
+    )
+
+
+def _normalize_concurrency_in_df(
+    df: pd.DataFrame, canonical: str = "# of max concurrency."
+) -> pd.DataFrame:
+    if canonical in df.columns:
+        return df
+    detected = _find_concurrency_col(df)
+    if detected in df.columns and detected != canonical:
+        return df.rename(columns={detected: canonical})
+    df[canonical] = pd.NA
+    return df
+
+
 # -----------------------------
 # Core data compare
 # -----------------------------
@@ -52,19 +91,25 @@ def compare_data_columns(
    - Concat along axis=1 (indexes align), then reset_index so callers can
      group by columns.
    - If --debug, add a <file_label>_name column per file.
+
+    Minimal fix to support different max_concurrency lists across files:
+      - normalize concurrency column naming to "# of max concurrency."
+      - align on UNION of keys (missing points become NaN)
+      - BUGFIX: don't drop throughput rows based on P99/Median presence
    """
    print("\ncompare_data_column:", data_column)

    frames = []
    raw_data_cols: list[str] = []
-    compare_frames = []

+    # Determine key cols after normalizing concurrency
    cols_per_file: list[set] = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
+        df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
        cols_per_file.append(set(df_tmp.columns))

    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -75,12 +120,25 @@ def compare_data_columns(
            "No common key columns found from info_cols across the input files."
        )

-    meta_added = False
+    union_index = None
+    metas: list[pd.DataFrame] = []
+    staged: list[tuple[str, pd.Series, pd.Series | None]] = []

    for file in files:
        df = pd.read_json(file, orient="records")
+        df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")

-        if drop_column in df.columns:
+        # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
+        # NaN in P99/Median columns even if the column exists in the JSON.
+        metric_lc = str(data_column).lower()
+        is_latency_metric = (
+            "ttft" in metric_lc
+            or "tpot" in metric_lc
+            or "p99" in metric_lc
+            or "median" in metric_lc
+            or metric_lc.strip() in {"p99", "median"}
+        )
+        if is_latency_metric and drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        for c in (
@@ -105,35 +163,61 @@ def compare_data_columns(
            meta = meta.groupby(level=key_cols, dropna=False).first()

        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
+
+        if data_column in df_idx.columns:
+            s = df_idx[data_column]
+            if not s.index.is_unique:
+                s = s.groupby(level=key_cols, dropna=False).mean()
+        else:
+            # keep NA series to preserve meta keys for union_index
+            s = pd.Series(pd.NA, index=meta.index)
        s.name = file_label

-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
+        name_s = None
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
-            frames.append(name_s)

-        frames.append(s)
+        if union_index is None:
+            union_index = meta.index
+        else:
+            union_index = union_index.union(meta.index)
+        metas.append(meta)
+
+        staged.append((file_label, s, name_s))
+
+    if union_index is None:
+        raise ValueError("No data found after loading inputs.")
+
+    # meta first (union-aligned): build UNION meta across all files
+    if metas:
+        meta_union = pd.concat(metas, axis=0)
+        # Collapse duplicates on the MultiIndex; keep first non-null per column
+        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
+        frames.append(meta_union.reindex(union_index))
+
+    # values + ratios (union-aligned)
+    metric_series_aligned: list[pd.Series] = []
+    for file_label, s, name_s in staged:
+        s_aligned = s.reindex(union_index)
+        frames.append(s_aligned)
        raw_data_cols.append(file_label)
-        compare_frames.append(s)
+        metric_series_aligned.append(s_aligned)

-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
+        if debug and name_s is not None:
+            frames.append(name_s.reindex(union_index))
+
+        if len(metric_series_aligned) >= 2:
+            base = metric_series_aligned[0]
+            current = metric_series_aligned[-1]
+            if "P99" in str(data_column) or "Median" in str(data_column):
                ratio = base / current
            else:
                ratio = current / base
            ratio = ratio.mask(base == 0)
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+            ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
            frames.append(ratio)

    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
 # -----------------------------
 # Styling helpers
 # -----------------------------
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
 def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
+    df: pd.DataFrame,
+    threshold: float,
+    slack_pct: float = 0.0,
 ) -> pd.io.formats.style.Styler:
    conc_col = _find_concurrency_col(df)
    key_cols = [
@@ -234,12 +304,24 @@ def _highlight_threshold(
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]

-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
+    try:
+        slack_pct = float(slack_pct or 0.0)
+    except Exception:
+        slack_pct = 0.0
+    slack_limit = threshold * (1.0 + slack_pct / 100.0)
+
+    def _cell(v):
+        if pd.isna(v):
+            return ""
+        if v <= threshold:
+            # Strict SLA
+            return "background-color:#e6ffe6;font-weight:bold;"
+        if v <= slack_limit:
+            # Within slack range
+            return "background-color:#ffe5cc;font-weight:bold;"
+        return ""
+
+    return df.style.map(_cell, subset=conf_cols)


 def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
      - max 31 chars
      - cannot contain: : \ / ? * [ ]
      - cannot be empty
+
+    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
+    module's compile overhead/edge-cases on some systems.
    """
    name = "sheet" if name is None else str(name)
-    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+
+    # Replace illegal characters with underscore.
+    trans = str.maketrans(
+        {
+            ":": "_",
+            "\\": "_",
+            "/": "_",
+            "?": "_",
+            "*": "_",
+            "[": "_",
+            "]": "_",
+        }
+    )
+    name = name.translate(trans)
+
+    # Strip quotes/spaces and collapse whitespace.
    name = name.strip().strip("'")
-    name = re.sub(r"\s+", " ", name)
+    name = " ".join(name.split())
+
    if not name:
        name = "sheet"
    return name[:31]
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:

 def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    d = dict(zip(group_cols, gkey_tuple))
-    model = d.get("Model", "model")
-    model_short = str(model).split("/")[-1]
+
+    # Always keep input/output lengths (these are important).
    ilen = d.get("Input Len", "")
    olen = d.get("Output Len", "")
    lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
+
+    # Shorten model name aggressively to make room for lens.
+    model = d.get("Model", "model")
+    leaf = str(model).split("/")[-1]
+
+    max_model_len = max(1, 31 - len(lens))
+    model_short = leaf[:max_model_len]
+
    return _sanitize_sheet_name(f"{model_short}{lens}")


 def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
 ):
-    startrow = 0
+    """Write all blocks to a sheet with a single to_excel() call.
+
+    Pandas+openpyxl can be extremely slow when called many times per sheet.
+    We flatten blocks into one table with a 'Section' column to keep structure
+    while making Excel generation fast and deterministic.
+    """
+    if not blocks:
+        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
+        return
+
+    combined_parts: list[pd.DataFrame] = []
    for title, df in blocks:
-        pd.DataFrame([[title]]).to_excel(
-            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
-        )
-        startrow += 1
-        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
-        startrow += len(df) + 3
+        df2 = df.copy()
+        # Put the section label as the first column for readability.
+        df2.insert(0, "Section", title)
+        combined_parts.append(df2)
+
+    combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
+    combined.to_excel(writer, sheet_name=sheet, index=False)


 def _safe_filename(s: str) -> str:
-    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
-    return s[:180] if len(s) > 180 else s
+    # Fast path without the third-party `regex` module.
+    s = " ".join(str(s).strip().split())
+    allowed = []
+    for ch in s:
+        if ch.isalnum() or ch in "._-":
+            allowed.append(ch)
+        else:
+            allowed.append("_")
+    out = "".join(allowed)
+    return out[:180] if len(out) > 180 else out


 # -----------------------------
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:


 def _max_concurrency_ok(
-    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+    df: pd.DataFrame,
+    conc_col: str,
+    cfg_col: str,
+    threshold: float,
+    slack_pct: float = 0.0,
 ):
    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
        return pd.NA
@@ -441,7 +573,14 @@ def _max_concurrency_ok(
    if d.empty:
        return pd.NA

-    ok = d[d[cfg_col] <= threshold]
+    # Accept values up to (1 + slack_pct%) above the SLA.
+    try:
+        slack_pct = float(slack_pct or 0.0)
+    except Exception:
+        slack_pct = 0.0
+    effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
+
+    ok = d[d[cfg_col] <= effective_limit]
    if ok.empty:
        return pd.NA

@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

+    # Display SLA ranges in the table header (SLA .. SLA*(1+slack))
+    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
+    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
+    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
+    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
+
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
+            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
+            )
            if tpot_group_df is not None
            else pd.NA
        )
@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
        rows.append(
            {
                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)

+    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
+    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
+    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
+    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
+
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
+                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
+            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
+                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
+            )
            if tpot_group_df is not None
            else pd.NA
        )
@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
        rows.append(
            {
                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
        help="Reference limit for TPOT plots (ms)",
    )

-    # ---- NEW: export options ----
+    # ---- SLA tolerance (slack) options ----
+    parser.add_argument(
+        "--ttft-slack-pct",
+        type=float,
+        default=5.0,
+        help="Allowed percentage above TTFT SLA (default: 5).",
+    )
+    parser.add_argument(
+        "--tpot-slack-pct",
+        type=float,
+        default=5.0,
+        help="Allowed percentage above TPOT SLA (default: 5).",
+    )
+
+    # ---- export options ----
    parser.add_argument(
        "--excel-out",
        type=str,
@@ -843,9 +1015,13 @@ def render_metric_table_html(

    metric_name = metric_label.lower()
    if "ttft" in metric_name:
-        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+        styler = _highlight_threshold(
+            display_group, args.ttft_max_ms, args.ttft_slack_pct
+        )
    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+        styler = _highlight_threshold(
+            display_group, args.tpot_max_ms, args.tpot_slack_pct
+        )
    else:
        styler = display_group.style

@@ -962,22 +1138,46 @@ def write_report_group_first(
        csv_dir.mkdir(parents=True, exist_ok=True)

    excel_path = args.excel_out or "perf_comparison.xlsx"
-    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+    disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
+
+    # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
+    excel_engine = (
+        os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
+    )
+    if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
+        excel_engine = "openpyxl"
+
+    excel_engine_kwargs = {}
+    if excel_engine == "xlsxwriter":
+        # Reduce memory pressure & usually faster writes.
+        excel_engine_kwargs = {"options": {"constant_memory": True}}
+
+    xw_ctx = (
+        nullcontext(None)
+        if disable_excel
+        else pd.ExcelWriter(
+            excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
+        )
+    )
+    with xw_ctx as xw:
+        used_sheets: set[str] = set()
        # ---- Environment sheet (first) ----
        env_sheet = _sanitize_sheet_name("Environment")
        env_df = _load_env_df_for_inputs(args, files)
-        if env_df is None or env_df.empty:
-            pd.DataFrame(
-                [
-                    {
-                        "Section": "Environment",
-                        "Key": "vllm_env.txt",
-                        "Value": "NOT FOUND (or empty)",
-                    }
-                ]
-            ).to_excel(xw, sheet_name=env_sheet, index=False)
-        else:
-            env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+        if xw is not None:
+            if env_df is None or env_df.empty:
+                pd.DataFrame(
+                    [
+                        {
+                            "Section": "Environment",
+                            "Key": "vllm_env.txt",
+                            "Value": "NOT FOUND (or empty)",
+                        }
+                    ]
+                ).to_excel(xw, sheet_name=env_sheet, index=False)
+            else:
+                env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+            used_sheets.add(env_sheet)
        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
            main_fh.write('<meta charset="utf-8">\n')
            for gkey in group_keys:
@@ -993,12 +1193,19 @@ def write_report_group_first(

                main_fh.write(group_header)

+                do_excel = xw is not None
                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
                sheet_base = sheet
-                dedup_i = 1
-                while sheet in xw.sheets:
-                    dedup_i += 1
-                    sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+                if do_excel:
+                    dedup_i = 1
+                    while sheet in used_sheets:
+                        dedup_i += 1
+                        suffix = f"_{dedup_i}"
+                        # Ensure uniqueness even when sheet names are truncated.
+                        base = str(sheet_base)
+                        keep = max(1, 31 - len(suffix))
+                        sheet = _sanitize_sheet_name(base[:keep] + suffix)
+                    used_sheets.add(sheet)

                excel_blocks: list[tuple[str, pd.DataFrame]] = []

@@ -1059,7 +1266,7 @@ def write_report_group_first(
                        )

                        excel_blocks.append(
-                            (metric_label, display_group.reset_index(drop=True))
+                            (metric_label, group_df.reset_index(drop=True))
                        )
                        if csv_dir:
                            fn = _safe_filename(
@@ -1067,7 +1274,7 @@ def write_report_group_first(
                                    "/", "_"
                                )
                            )
-                            display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
+                            group_df.to_csv(csv_dir / f"{fn}.csv", index=False)

                    summary_html = build_valid_max_concurrency_summary_html(
                        tput_group_df=tput_group_df,
@@ -1097,9 +1304,13 @@ def write_report_group_first(
                            )
                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)

-                _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
+                if do_excel:
+                    _write_tables_to_excel_sheet(xw, sheet, excel_blocks)

-    print(f"Wrote Excel: {excel_path}")
+    if disable_excel:
+        print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
+    else:
+        print(f"Wrote Excel: {excel_path}")
    if csv_dir:
        print(f"Wrote CSVs under: {csv_dir}")

--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"

+# Adaptive search controls
+ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
+SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
+SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
+ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
+ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
+
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

+# -------------------------------
+# Adaptive concurrency helpers
+# -------------------------------
+result_json_path_for_serving() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
+}
+
+extract_metric_ms() {
+  local metric_name=$1
+  local json_file=$2
+
+  [[ -f "$json_file" ]] || return 0
+
+  if [[ "$metric_name" == "ttft" ]]; then
+    jq -r '
+      [
+        .ttft_ms.p99?,
+        .metrics.ttft_ms.p99?,
+        .ttft.p99?,
+        .metrics.ttft.p99?,
+        .p99_ttft_ms?,
+        .ttft_ms.mean?,
+        .metrics.ttft_ms.mean?,
+        .ttft.mean?,
+        .metrics.ttft.mean?,
+        .mean_ttft_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  else
+    jq -r '
+      [
+        .tpot_ms.p99?,
+        .metrics.tpot_ms.p99?,
+        .tpot.p99?,
+        .metrics.tpot.p99?,
+        .p99_tpot_ms?,
+        .itl_ms.p99?,
+        .metrics.itl_ms.p99?,
+        .inter_token_latency_ms.p99?,
+        .tpot_ms.mean?,
+        .metrics.tpot_ms.mean?,
+        .tpot.mean?,
+        .metrics.tpot.mean?,
+        .itl_ms.mean?,
+        .metrics.itl_ms.mean?,
+        .mean_tpot_ms?,
+        .mean_itl_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  fi
+}
+
+evaluate_sla_from_json() {
+  local json_file=$1
+  local ttft
+  local tpot
+  local pass
+
+  [[ -f "$json_file" ]] || return 2
+
+  ttft=$(extract_metric_ms ttft "$json_file")
+  tpot=$(extract_metric_ms tpot "$json_file")
+
+  [[ -n "$ttft" && -n "$tpot" ]] || return 2
+
+  pass=$(jq -n \
+    --argjson ttft "$ttft" \
+    --argjson tpot "$tpot" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
+
+  [[ "$pass" == "true" ]]
+}
+
+write_adaptive_summary_json() {
+  local summary_file=$1
+  local test_name=$2
+  local qps=$3
+  local static_last_pass=$4
+  local static_first_fail=$5
+  local final_last_pass=$6
+  local final_first_fail=$7
+
+  jq -n \
+    --arg test_name "$test_name" \
+    --arg qps "$qps" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    --arg static_last_pass "${static_last_pass:-}" \
+    --arg static_first_fail "${static_first_fail:-}" \
+    --arg final_last_pass "${final_last_pass:-}" \
+    --arg final_first_fail "${final_first_fail:-}" \
+    '{
+      test_name: $test_name,
+      qps: $qps,
+      sla_ttft_ms: $sla_ttft,
+      sla_tpot_ms: $sla_tpot,
+      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
+      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
+      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
+      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
+    }' > "$summary_file"
+}
+
+run_single_serving_probe() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+
+  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
+  local result_json
+  local num_prompts_arg=""
+  local client_command
+
+  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
+
+  if [[ -f "$result_json" ]]; then
+    evaluate_sla_from_json "$result_json"
+    return $?
+  fi
+
+  if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+    num_prompts_arg="--num-prompts $num_prompts"
+  fi
+
+  client_command="vllm bench serve \
+    --save-result \
+    --result-dir $RESULTS_FOLDER \
+    --result-filename ${new_test_name}.json \
+    --request-rate $qps \
+    --max-concurrency $max_concurrency \
+    $num_prompts_arg \
+    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
+    $client_args_effective $client_remote_args "
+
+  echo "Adaptive probe: $client_command"
+
+  if [[ "${DRY_RUN:-0}" != "1" ]]; then
+    bash -c "$client_command"
+  fi
+
+  jq_output=$(jq -n \
+    --arg server "$server_command" \
+    --arg client "$client_command" \
+    --arg gpu "$gpu_type" \
+    '{
+      server_command: $server,
+      client_command: $client,
+      gpu_type: $gpu,
+      adaptive_search: true
+    }')
+  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+
+  evaluate_sla_from_json "$result_json"
+}
+
+adaptive_refine_from_static_results() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency_list_raw=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+
+  local sorted_points
+  local point
+  local rc
+  local static_last_pass=""
+  local static_first_fail=""
+  local largest_static=""
+  local step_hint=1
+  local previous_point=""
+  local low
+  local high
+  local mid
+  local probes=0
+  local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
+
+  [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
+  [[ "${DRY_RUN:-0}" != "1" ]] || return 0
+
+  sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
+  [[ -n "$sorted_points" ]] || return 0
+
+  while read -r point; do
+    [[ -z "$point" ]] && continue
+    largest_static="$point"
+    evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
+    rc=$?
+    if (( rc == 0 )); then
+      static_last_pass="$point"
+    elif (( rc == 1 )); then
+      if [[ -n "$static_last_pass" ]]; then
+        static_first_fail="$point"
+        break
+      fi
+    fi
+
+    if [[ -n "$previous_point" ]]; then
+      step_hint=$(( point - previous_point ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    fi
+    previous_point="$point"
+  done <<< "$sorted_points"
+
+  if [[ -z "$static_last_pass" ]]; then
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
+    return 0
+  fi
+
+  if [[ -n "$static_first_fail" ]]; then
+    low=$static_last_pass
+    high=$static_first_fail
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
+    return 0
+  fi
+
+  low=$largest_static
+  high=""
+  while (( probes < ADAPTIVE_MAX_PROBES )); do
+    point=$(( low + step_hint ))
+    if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
+      point=$ADAPTIVE_MAX_CONCURRENCY
+    fi
+    (( point > low )) || break
+    probes=$(( probes + 1 ))
+    run_single_serving_probe \
+      "$test_name" "$qps" "$point" "$tp" \
+      "$compilation_config_mode" "$optimization_level" \
+      "$client_args_effective" "$client_remote_args" "$server_command"
+    rc=$?
+    if (( rc == 0 )); then
+      low=$point
+      (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
+      step_hint=$(( step_hint * 2 ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    elif (( rc == 1 )); then
+      high=$point
+      break
+    else
+      break
+    fi
+  done
+
+  if [[ -n "$high" ]]; then
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+  fi
+
+  write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
+}
+
 run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')

-    server_args=$(json2args "$server_params")
+    # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
+    server_model=$(echo "$server_params" | jq -r '.model // empty')
+    if [[ -z "$server_model" || "$server_model" == "null" ]]; then
+      echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
+      exit 1
+    fi
+    server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
+    server_args=$(json2args "$server_params_no_model")
+
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")

+    # ------------------------------------------------------------
+    # Option 1: Dynamic num-prompts scaling based on max_concurrency
+    #
+    # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
+    #   num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
+    #
+    # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
+    # unchanged (i.e., whatever is in serving-tests-*.json).
+    # ------------------------------------------------------------
+    PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}"  # no default on purpose
+    MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
+    MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
+
+    if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
+      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
+      # Handles: --num-prompts 123   and   --num-prompts=123
+      client_args_no_np="$(
+        printf ' %s ' "$client_args" \
+        | sed -E \
+          -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
+          -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
+      )"
+      # normalize whitespace
+      client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
+      client_args_no_np="$(echo "$client_args_no_np" | xargs)"
+      client_args_effective="$client_args_no_np"
+    else
+      client_args_effective="$client_args"
+    fi
    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
    fi

    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

-    server_command="$server_envs vllm serve \
+    server_command="$server_envs vllm serve $server_model \
      $server_args"

    # run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
+        # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
+        num_prompts_arg=""
+        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+          num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+          if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+          if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+          num_prompts_arg="--num-prompts $num_prompts"
+        fi
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
+          $num_prompts_arg \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args $client_remote_args "
+          $client_args_effective $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
+
+      adaptive_refine_from_static_results \
+        "$test_name" "$qps" "$max_concurrency_list" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
    done

    # clean up
@@ -532,6 +888,7 @@ main() {
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+  python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json

  upload_to_buildkite
 }
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -51,5 +51,56 @@
            "max-model-len": 256,
            "async-scheduling": ""
        }
+    },
+    {
+        "test_name": "latency_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "dtype": "bfloat16"
+        }
+    },
+    {
+        "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "max-model-len": 512,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "gpu-memory-utilization": 0.95,
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "latency_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "dtype": "bfloat16",
+            "async-scheduling": ""
+        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
@@ -0,0 +1,37 @@
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
+    },
+    "server_parameters": {
+      "dtype": "bfloat16",
+      "model": "openai/whisper-large-v3-turbo"
+    },
+    "client_parameters": {
+      "model": "openai/whisper-large-v3-turbo",
+      "backend": "openai-audio",
+      "endpoint": "/v1/audio/transcriptions",
+      "dataset_name": "hf",
+      "dataset_path": "openslr/librispeech_asr",
+      "hf_subset": "clean",
+      "hf_split": "test",
+      "no_stream": "",
+      "no_oversample": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {}
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -149,6 +149,39 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
@@ -188,6 +221,45 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_int8_tp1_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp2_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp4_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -72,17 +72,6 @@
        "random-output-len": 128
      }
    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
@@ -105,17 +94,6 @@
        "random-output-len": 2048
      }
    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
@@ -139,14 +117,25 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
      "server_parameters": {
-        "tensor_parallel_size": 4
+        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
      }
    }
  ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,7 +10,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -37,7 +36,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -64,7 +62,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -78,5 +75,83 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
+    },
+    {
+        "test_name": "serving_deepseek_r1",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 200,
+            "async-scheduling": "",
+            "dtype": "bfloat16"
+        },
+        "client_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "enable_expert_parallel": "",
+            "max-num-batched-tokens": 4096
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_qwen3_8b",
+        "qps_list": [1, 4, 10, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "disable_log_stats": "",
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,7 +5,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -23,7 +22,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -41,7 +39,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -59,7 +56,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,
--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -57,5 +57,67 @@
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
+    },
+    {
+        "test_name": "throughput_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 384,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": "",
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "max-num-seqs": 512,
+            "backend": "vllm",
+            "async-scheduling": ""
+        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -83,7 +83,7 @@ steps:
        agents:
          queue: cpu_queue_postmerge
        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
          queue: cpu_queue_postmerge
        commands:
          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
        env:
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
 \`\`\`

@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
 - **torchvision**: TorchVision for ROCm PyTorch
 - **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
+- **amd_aiter**: Aiter for ROCm
 - **flash-attn**: Flash Attention for ROCm

 ### :warning: Notes
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Check if Ray LLM can generate lock files that are compatible with this
+# version of vllm. Downloads Ray's requirement files and runs a full
+# dependency resolution with the installed vllm's constraints to see if
+# a valid lock file can be produced.
+#
+# See: https://github.com/vllm-project/vllm/issues/33599
+
+set -eo pipefail
+
+RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
+
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+# Fetch all Ray requirement files used in the LLM depset pipeline
+echo ">>> Fetching Ray requirement files"
+RAY_FILES=(
+    "requirements.txt"
+    "requirements/cloud-requirements.txt"
+    "requirements/base-test-requirements.txt"
+    "requirements/llm/llm-requirements.txt"
+    "requirements/llm/llm-test-requirements.txt"
+)
+for FILE in "${RAY_FILES[@]}"; do
+    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
+    echo "    ${FILE}"
+    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
+done
+
+# Extract installed vllm deps
+echo ">>> Extracting installed vllm dependency constraints"
+python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
+"""Write out the installed vllm's dependencies as pip constraint lines.
+
+Ray uses vllm[audio], so audio-extra deps are included with their extra
+markers stripped. The resolver cannot evaluate extra markers for a
+package that is not itself being resolved from an index, so we activate
+them manually here.
+"""
+import importlib.metadata
+import re
+import sys
+
+out_path = sys.argv[1]
+raw_reqs = importlib.metadata.requires("vllm") or []
+
+# Ray uses vllm[audio] – activate that extra.
+ACTIVE_EXTRAS = {"audio"}
+EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
+
+lines = []
+for r in raw_reqs:
+    if ";" not in r:
+        # Unconditional dep — always include.
+        lines.append(r.strip())
+        continue
+
+    req_part, _, marker_part = r.partition(";")
+    marker_part = marker_part.strip()
+
+    extra_matches = EXTRA_RE.findall(marker_part)
+    if not extra_matches:
+        # Non-extra marker (python_version, etc.) — keep as-is.
+        lines.append(r.strip())
+        continue
+
+    if not ACTIVE_EXTRAS.intersection(extra_matches):
+        continue  # Skip inactive extras (tensorizer, bench, …).
+
+    # Strip the extra== conditions but keep any remaining markers
+    # (e.g. python_version).
+    cleaned = EXTRA_RE.sub("", marker_part)
+    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
+    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
+
+    if cleaned:
+        lines.append(f"{req_part.strip()} ; {cleaned}")
+    else:
+        lines.append(req_part.strip())
+
+with open(out_path, "w") as f:
+    for line in lines:
+        f.write(line + "\n")
+
+print(f"Wrote {len(lines)} constraints to {out_path}")
+PYEOF
+
+echo ">>> Installed vllm deps (first 20 lines):"
+head -20 "${WORK_DIR}/vllm-constraints.txt"
+
+# Remove Ray's vllm pin — the installed vllm's transitive deps
+# (written above) replace it in the resolution. vllm itself cannot
+# be resolved from PyPI for in-development versions, so we test
+# whether Ray's requirements can coexist with vllm's dependency
+# constraints instead.
+sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
+
+# Install uv if needed
+if ! command -v uv &>/dev/null; then
+    echo ">>> Installing uv"
+    pip install uv -q
+fi
+
+# Resolve: given vllm's constraints, can Ray compile a lock file?
+#
+# vllm's dependency constraints are the fixed side — Ray is flexible and
+# can regenerate its lock files. We pass vllm's constraints via -c so
+# the resolver treats them as non-negotiable bounds, then check whether
+# Ray's own requirements can still be satisfied within those bounds.
+echo ""
+echo "============================================================"
+echo ">>> Resolving: Can Ray generate compatible lock files?"
+echo "============================================================"
+
+set +e
+uv pip compile \
+    "${WORK_DIR}/requirements.txt" \
+    "${WORK_DIR}/cloud-requirements.txt" \
+    "${WORK_DIR}/base-test-requirements.txt" \
+    "${WORK_DIR}/llm-requirements.txt" \
+    "${WORK_DIR}/llm-test-requirements.txt" \
+    -c "${WORK_DIR}/vllm-constraints.txt" \
+    --python-version 3.12 \
+    --python-platform x86_64-manylinux_2_31 \
+    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    --index-strategy unsafe-best-match \
+    --unsafe-package setuptools \
+    --unsafe-package ray \
+    --no-header \
+    -o "${WORK_DIR}/resolved.txt" \
+    2>&1
+EXIT_CODE=$?
+set -e
+
+echo ""
+echo "=========================================="
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
+    echo ""
+    echo "Key resolved versions:"
+    grep -E '^(protobuf|torch|numpy|transformers)==' \
+        "${WORK_DIR}/resolved.txt" | sort || true
+    echo "=========================================="
+    exit 0
+fi
+
+echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
+echo "This means a fundamental dependency conflict exists that Ray"
+echo "cannot resolve by regenerating its lock files."
+echo "See: https://github.com/vllm-project/vllm/issues/33599"
+echo "=========================================="
+
+# Buildkite annotation
+if [ -f /usr/bin/buildkite-agent ]; then
+    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
+### :warning: Ray Dependency Compatibility Warning
+This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
+Ray would not be able to regenerate its lock files to accommodate this vllm version.
+
+Please check the **Ray Dependency Compatibility Check** step logs for details.
+See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
+EOF
+fi
+
+# Notify Slack if webhook is configured and PR/branch are valid.
+if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
+    PR="${BUILDKITE_PULL_REQUEST:-}"
+    BRANCH="${BUILDKITE_BRANCH:-}"
+
+    # Skip notification if PR is invalid or branch is empty
+    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
+        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
+    else
+        echo ">>> Sending Slack notification"
+        # Single quotes are intentional: the f-string expressions are Python, not shell.
+        # shellcheck disable=SC2016
+        PAYLOAD=$(python3 -c '
+import json, os, sys
+pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
+branch = os.getenv("BUILDKITE_BRANCH", "unknown")
+url = os.getenv("BUILDKITE_BUILD_URL", "#")
+data = {
+    "text": ":warning: Ray Dependency Compatibility Check Failed",
+    "blocks": [{
+        "type": "section",
+        "text": {
+            "type": "mrkdwn",
+            "text": (
+                "*:warning: Ray Dependency Compatibility Check Failed*\n"
+                f"PR #{pr} on branch `{branch}` introduces dependencies "
+                f"that cannot be resolved with Ray'\''s requirements.\n"
+                f"<{url}|View Build>"
+            ),
+        },
+    }],
+}
+print(json.dumps(data))
+')
+
+        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
+            -H 'Content-type: application/json' \
+            -d "$PAYLOAD")
+        echo "    Slack webhook response: $HTTP_CODE"
+    fi
+else
+    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
+fi
+
+exit 1
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -6,6 +6,26 @@
 # Multi-node detection: Instead of matching on fragile group names, we detect
 # multi-node jobs structurally by looking for the bracket command syntax
 # "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
+#
+###############################################################################
+# QUOTING / COMMAND PASSING
+#
+# Passing commands as positional arguments ($*) is fragile when the command
+# string itself contains double quotes, e.g.:
+#
+#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
+#
+# The outer shell resolves the nested quotes *before* this script runs, so
+# the script receives mangled input it cannot fully recover.
+#
+# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
+#
+#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
+#   bash run-amd-test.sh
+#
+# Single-quoted assignment preserves all inner double quotes verbatim.
+# The $* path is kept for backward compatibility but callers should migrate.
+###############################################################################
 set -o pipefail

 # Export Python path
@@ -79,26 +99,169 @@ is_multi_node() {
  return 1
 }

+handle_pytest_exit() {
+  local exit_code=$1
+  if [ "$exit_code" -eq 5 ]; then
+    echo "Pytest exit code 5 (no tests collected) - treating as success."
+    exit 0
+  fi
+  exit "$exit_code"
+}
+
 ###############################################################################
-# Pytest marker re-quoting
+# Pytest marker/keyword re-quoting
 #
 # When commands are passed through Buildkite -> shell -> $* -> bash -c,
-# quotes around pytest -m marker expressions get stripped:
+# quotes around multi-word pytest -m/-k expressions get stripped:
 #   pytest -v -s -m 'not cpu_test' v1/core
 # becomes:
 #   pytest -v -s -m not cpu_test v1/core
 #
 # pytest then interprets "cpu_test" as a file path, not part of the marker.
-# This function detects unquoted multi-word marker expressions and re-quotes
-# them so they survive the final bash -c expansion.
+#
+# This function detects unquoted expressions after -m/-k and re-quotes them
+# by collecting tokens until a recognizable boundary is reached:
+#   - test path (contains '/')
+#   - test file (ends with '.py')
+#   - another pytest flag (--xxx or -x single-char flags)
+#   - command separator (&& || ; |)
+#   - environment variable assignment (FOO=bar)
+#
+# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
+# unquoted since they have no spaces and work fine.
+#
+# Already-quoted expressions (containing literal single quotes) are passed
+# through untouched to avoid double-quoting values injected by
+# apply_rocm_test_overrides.
+#
+# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
+# double-quotes stripped by the calling shell (see header comment).
+# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
 ###############################################################################
-
 re_quote_pytest_markers() {
-  local cmds="$1"
-  # Pattern: -m not <identifier>  ->  -m 'not <identifier>'
-  # Handles the common cases: 'not cpu_test', 'not slow_test', etc.
-  cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
-  echo "$cmds"
+  local input="$1"
+  local output=""
+  local collecting=false
+  local marker_buf=""
+
+  # Strip backslash-newline continuations, then flatten remaining newlines
+  local flat="${input//$'\\\n'/ }"
+  flat="${flat//$'\n'/ }"
+
+  # Disable globbing to prevent *.py etc. from expanding during read -ra
+  local restore_glob
+  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
+  set -o noglob
+  local -a words
+  read -ra words <<< "$flat"
+  eval "$restore_glob"
+
+  for word in "${words[@]}"; do
+    if $collecting; then
+      # If the token we're about to collect already contains a literal
+      # single quote, the expression was already quoted upstream.
+      # Flush and stop collecting.
+      if [[ "$word" == *"'"* ]]; then
+        if [[ -n "$marker_buf" ]]; then
+          # Should not normally happen (partial buf + quote), flush raw
+          output+="${marker_buf} "
+          marker_buf=""
+        fi
+        output+="${word} "
+        collecting=false
+        continue
+      fi
+
+      local is_boundary=false
+      case "$word" in
+        # Line-continuation artifact
+        "\\")
+          is_boundary=true ;;
+        # Command separators
+        "&&"|"||"|";"|"|")
+          is_boundary=true ;;
+        # Long flags (--ignore, --shard-id, etc.)
+        --*)
+          is_boundary=true ;;
+        # Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
+        # like "not" which don't start with "-". Also skip -k/-m which
+        # would start a new marker (handled below).
+        -[a-zA-Z])
+          is_boundary=true ;;
+        # Test path (contains /)
+        */*)
+          is_boundary=true ;;
+        # Test file (ends with .py, possibly with ::method)
+        *.py|*.py::*)
+          is_boundary=true ;;
+        # Environment variable assignment preceding a command (FOO=bar)
+        *=*)
+          # Only treat as boundary if it looks like VAR=value, not
+          # pytest filter expressions like num_gpus=2 inside markers
+          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
+            is_boundary=true
+          fi
+          ;;
+      esac
+
+      if $is_boundary; then
+        # Strip surrounding double quotes if present (from upstream
+        # single-to-double conversion); without this, wrapping below
+        # would produce '"expr"' with literal double-quote characters.
+        if [[ "$marker_buf" == '"'*'"' ]]; then
+          marker_buf="${marker_buf#\"}"
+          marker_buf="${marker_buf%\"}"
+        fi
+        # Flush the collected marker expression
+        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+          output+="'${marker_buf}' "
+        else
+          output+="${marker_buf} "
+        fi
+        collecting=false
+        marker_buf=""
+        # Check if this boundary word itself starts a new -m/-k
+        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
+          output+="${word} "
+          collecting=true
+        # Drop stray backslash tokens silently
+        elif [[ "$word" == "\\" ]]; then
+          :
+        else
+          output+="${word} "
+        fi
+      else
+        # Accumulate into marker buffer
+        if [[ -n "$marker_buf" ]]; then
+          marker_buf+=" ${word}"
+        else
+          marker_buf="${word}"
+        fi
+      fi
+    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
+      output+="${word} "
+      collecting=true
+      marker_buf=""
+    else
+      output+="${word} "
+    fi
+  done
+
+  # Flush any trailing marker expression (marker at end of command)
+  if $collecting && [[ -n "$marker_buf" ]]; then
+    # Strip surrounding double quotes (see mid-stream flush comment)
+    if [[ "$marker_buf" == '"'*'"' ]]; then
+      marker_buf="${marker_buf#\"}"
+      marker_buf="${marker_buf%\"}"
+    fi
+    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+      output+="'${marker_buf}'"
+    else
+      output+="${marker_buf}"
+    fi
+  fi
+
+  echo "${output% }"
 }

 ###############################################################################
@@ -170,15 +333,15 @@ apply_rocm_test_overrides() {
  # --- Entrypoint ignores ---
  if [[ $cmds == *" entrypoints/openai "* ]]; then
    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
-    --ignore=entrypoints/openai/test_audio.py \
-    --ignore=entrypoints/openai/test_shutdown.py \
+    --ignore=entrypoints/openai/chat_completion/test_audio.py \
+    --ignore=entrypoints/openai/completion/test_shutdown.py \
    --ignore=entrypoints/openai/test_completion.py \
    --ignore=entrypoints/openai/test_models.py \
    --ignore=entrypoints/openai/test_lora_adapters.py \
    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-    --ignore=entrypoints/openai/test_root_path.py \
+    --ignore=entrypoints/openai/chat_completion/test_root_path.py \
    --ignore=entrypoints/openai/test_tokenization.py \
-    --ignore=entrypoints/openai/test_prompt_validation.py "}
+    --ignore=entrypoints/openai/completion/test_prompt_validation.py "}
  fi

  if [[ $cmds == *" entrypoints/llm "* ]]; then
@@ -231,11 +394,35 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

-commands="$*"
+# ---- Command source selection ----
+# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
+# Fall back to $* for backward compatibility, but warn that inner
+# double-quotes will have been stripped by the calling shell.
+if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
+  commands="${VLLM_TEST_COMMANDS}"
+  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
+else
+  commands="$*"
+  if [[ -z "$commands" ]]; then
+    echo "Error: No test commands provided." >&2
+    echo "Usage:" >&2
+    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
+    echo "  Legacy:     bash $0 \"commands here\"" >&2
+    exit 1
+  fi
+  echo "Commands sourced from positional args (legacy mode)"
+  echo "WARNING: Inner double-quotes in the command string may have been"
+  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
+  echo "  export VLLM_TEST_COMMANDS='your commands here'"
+  echo "  bash $0"
+fi
+
 echo "Raw commands: $commands"

 # Fix quoting before ROCm overrides (so overrides see correct structure)
 commands=$(re_quote_pytest_markers "$commands")
+echo "After re-quoting: $commands"
+
 commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"

@@ -248,6 +435,18 @@ if [[ -z "$render_gid" ]]; then
  exit 1
 fi

+# --- RDMA device passthrough (conditional) ---
+# If the host has RDMA devices, pass them through so tests like
+# test_moriio_connector can access ibverbs. On hosts without RDMA
+# hardware the tests will gracefully skip via _rdma_available().
+RDMA_FLAGS=""
+if [ -d /dev/infiniband ]; then
+  echo "RDMA devices detected on host, enabling passthrough"
+  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
+else
+  echo "No RDMA devices found on host, RDMA tests will be skipped"
+fi
+
 # --- Route: multi-node vs single-node ---
 if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
@@ -282,7 +481,9 @@ if is_multi_node "$commands"; then
    done

    /bin/bash -c "${composite_command}"
+    exit_code=$?
    cleanup_network
+    handle_pytest_exit "$exit_code"
  else
    echo "Multi-node job detected but failed to parse bracket command syntax."
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
@@ -295,6 +496,7 @@ else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    $RDMA_FLAGS \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
@@ -302,10 +504,15 @@ else
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
+    -e BUILDKITE_PARALLEL_JOB \
+    -e BUILDKITE_PARALLEL_JOB_COUNT \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
+
+  exit_code=$?
+  handle_pytest_exit "$exit_code"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1 
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1 
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+wait_for_pid_and_check_log() {
+    local pid="$1"
+    local log_file="$2"
+    local exit_status
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+    
+    # Use the 'wait' command to pause the script until the specific PID exits.
+    # The 'wait' command's own exit status will be that of the waited-for process.
+    if wait "$pid"; then
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Success)."
+    else
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+    fi
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process exited with a non-zero status."
+        echo "--- Last few lines of log file: $log_file ---"
+        tail -n 50 "$log_file"
+        echo "---------------------------------------------"
+        return 1 # Indicate failure based on exit status
+    fi
+
+    echo "No errors detected in log file and process exited successfully."
+    return 0
+}
+
+# Test Sky Lake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+wait_for_pid_and_check_log $PID_TEST_0 test_0.log
+wait_for_pid_and_check_log $PID_TEST_1 test_1.log
+wait_for_pid_and_check_log $PID_TEST_2 test_2.log
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -1,26 +1,43 @@
 #!/bin/bash
 set -euox pipefail
+export VLLM_CPU_CI_ENV=0

 echo "--- PP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename tp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/tp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi

 echo "--- DP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename dp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/dp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -1,9 +1,27 @@
 #!/bin/bash

-# This script build the CPU docker image and run the offline inference inside the container.
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
+#
+# vllm-gaudi compatibility pinning:
+#   The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
+#   When upstream vllm changes its API, the plugin may break before it has been updated.
+#   To handle this, the vllm-gaudi repository maintains a file:
+#     vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
+#   The first line of that file controls what version of vllm is used inside the Docker image:
+#     - "latest"        : no checkout override; the current Buildkite CI commit is used as-is.
+#     - "<commit SHA>"  : vllm is checked out to that specific commit before building, pinning
+#                         the test to a known-compatible baseline.
+#   To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

+# Fetch the vllm community commit reference from vllm-gaudi (first line only).
+VLLM_COMMUNITY_COMMIT=$(curl -s \
+  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
+  | head -1 | tr -d '\n')
+
+echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
+
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
@@ -12,6 +30,13 @@ FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

+# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
+# to the version known to be compatible with vllm-gaudi. When the value is "latest",
+# the current checkout (the Buildkite CI commit) is used unchanged.
+RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
+      cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
+    fi
+
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
@@ -51,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '

 EXITCODE=$?
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -34,17 +34,17 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
-  PLATFORM_ARGS=("--no-async-scheduling")
+  PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
  # Non-ROCm platform (CUDA/other)
--- a/.buildkite/scripts/tool_call/run-bfcl-eval.sh
+++ b/.buildkite/scripts/tool_call/run-bfcl-eval.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
+# evaluation against a local vLLM server.
+#
+# Usage:
+#   # Run with defaults (gpt-oss-20b, multi_turn)
+#   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+#   # Run with gpt-oss-120b and multiple test categories
+#   BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
+#     BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
+#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+#   # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
+#   BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
+#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
+#   BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
+#     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
+#
+# Environment variables (all optional, with defaults):
+#   BFCL_MODEL          - HF model name (default: openai/gpt-oss-20b)
+#   BFCL_API_TYPE       - API type: "chat_completions" or "responses" (default: chat_completions)
+#   BFCL_OUTPUT_DIR     - Directory for BFCL results (default: current working directory)
+#   BFCL_TEST_CATEGORY  - BFCL test categories (default: multi_turn)
+#   BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
+#   BFCL_NUM_THREADS    - Threads for BFCL generate (default: 8)
+#   BFCL_TP_SIZE        - Tensor parallel size (default: 1)
+#   BFCL_MAX_MODEL_LEN  - Max model length (default: 4096)
+#   BFCL_PORT           - Server port (default: 8000)
+#   BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
+#   BFCL_EXTRA_ARGS     - Additional vLLM server args
+
+set -euo pipefail
+
+# ---- Configuration ----
+MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
+API_TYPE="${BFCL_API_TYPE:-chat_completions}"
+OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
+TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
+TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
+NUM_THREADS="${BFCL_NUM_THREADS:-8}"
+TP_SIZE="${BFCL_TP_SIZE:-1}"
+MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
+PORT="${BFCL_PORT:-8000}"
+REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
+EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
+
+# Set up output directory
+if [ -n "$OUTPUT_DIR" ]; then
+    mkdir -p "$OUTPUT_DIR"
+    OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
+fi
+
+echo "============================================"
+echo "BFCL Tool Call Correctness Evaluation"
+echo "============================================"
+echo "Model:          $MODEL"
+echo "Tool parser:    $TOOL_CALL_PARSER"
+echo "API type:       $API_TYPE"
+echo "Output dir:     ${OUTPUT_DIR:-<cwd>}"
+echo "Test category:  $TEST_CATEGORY"
+echo "TP size:        $TP_SIZE"
+echo "Max model len:  $MAX_MODEL_LEN"
+echo "Port:           $PORT"
+echo "Num threads:    $NUM_THREADS"
+echo "============================================"
+
+# ---- Install bfcl-eval if missing ----
+if ! python3 -c "import bfcl_eval" 2>/dev/null; then
+    echo "Installing bfcl-eval..."
+    pip install "bfcl-eval>=2025.10.20.1,<2026"
+fi
+
+# ---- Cleanup handler ----
+SERVER_PID=""
+cleanup() {
+    if [ -n "$SERVER_PID" ]; then
+        echo "Stopping vLLM server (pid=$SERVER_PID)..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    # Remove BFCL lock files (created by filelock for thread-safe writes)
+    rm -rf .file_locks/
+    if [ -n "${OUTPUT_DIR:-}" ]; then
+        rm -rf "$OUTPUT_DIR/.file_locks/"
+    fi
+}
+trap cleanup EXIT
+
+# ---- Start vLLM server ----
+echo "Starting vLLM server..."
+
+SERVE_ARGS=(
+    "$MODEL"
+    --port "$PORT"
+    --enable-auto-tool-choice
+    --tool-call-parser "$TOOL_CALL_PARSER"
+    --tensor-parallel-size "$TP_SIZE"
+    --max-model-len "$MAX_MODEL_LEN"
+    --enforce-eager
+    --no-enable-prefix-caching
+)
+
+# Append reasoning parser if specified
+if [ -n "$REASONING_PARSER" ]; then
+    SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
+fi
+
+# Append any extra args
+if [ -n "$EXTRA_ARGS" ]; then
+    read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
+    SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
+fi
+
+echo "Command: vllm serve ${SERVE_ARGS[*]}"
+vllm serve "${SERVE_ARGS[@]}" &
+SERVER_PID=$!
+
+# ---- Wait for server to be ready ----
+echo "Waiting for vLLM server to start (timeout: 600s)..."
+SECONDS_WAITED=0
+until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
+    if [ $SECONDS_WAITED -ge 600 ]; then
+        echo ""
+        echo "ERROR: vLLM server failed to start within 600s"
+        exit 1
+    fi
+    if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
+        echo "  Still waiting... (${SECONDS_WAITED}s elapsed)"
+    fi
+    sleep 2
+    SECONDS_WAITED=$((SECONDS_WAITED + 2))
+done
+echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
+
+# ---- Run BFCL evaluation ----
+# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
+# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
+# be patched in-process so BFCL knows to use the OpenAI-compatible handler
+# against our local vLLM server.
+bfcl_exit_code=0
+python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
+import os
+import sys
+
+model = sys.argv[1]
+test_category = sys.argv[2]
+num_threads = int(sys.argv[3])
+port = sys.argv[4]
+api_type = sys.argv[5]
+output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
+
+os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
+os.environ["OPENAI_API_KEY"] = "dummy"
+os.environ["BFCL_PROJECT_ROOT"] = output_dir
+
+import bfcl_eval.constants.model_config as bfcl_model_config
+from bfcl_eval.constants.model_config import ModelConfig
+from bfcl_eval.model_handler.api_inference.openai_completion import (
+    OpenAICompletionsHandler,
+)
+from bfcl_eval.model_handler.api_inference.openai_response import (
+    OpenAIResponsesHandler,
+)
+
+if api_type == "responses":
+    handler = OpenAIResponsesHandler
+else:
+    handler = OpenAICompletionsHandler
+
+bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
+    model_name=model,
+    display_name=f"{model} (FC) (vLLM)",
+    url=f"https://huggingface.co/{model}",
+    org="",
+    license="apache-2.0",
+    model_handler=handler,
+    input_price=None,
+    output_price=None,
+    is_fc_model=True,
+    underscore_to_dot=True,
+)
+
+from bfcl_eval.__main__ import evaluate, generate
+import inspect
+import typer
+
+
+def _get_default_kwargs(function):
+    kwargs = {}
+    for k, v in inspect.signature(function).parameters.items():
+        if v.default is not inspect.Parameter.empty:
+            default = v.default
+            if isinstance(default, typer.models.OptionInfo):
+                default = default.default
+            kwargs[k] = default
+    return kwargs
+
+
+# ---- generate ----
+print(f"=== BFCL generate: model={model} test_category={test_category} ===")
+gen_kwargs = _get_default_kwargs(generate)
+gen_kwargs["model"] = [model]
+gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
+gen_kwargs["skip_server_setup"] = True
+gen_kwargs["num_threads"] = num_threads
+generate(**gen_kwargs)
+
+# ---- evaluate ----
+print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
+eval_kwargs = _get_default_kwargs(evaluate)
+eval_kwargs["model"] = [model]
+eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
+evaluate(**eval_kwargs)
+
+print("=== BFCL evaluation completed successfully ===")
+PYEOF
+
+# ---- Upload results to buildkite ----
+if command -v buildkite-agent &>/dev/null; then
+    if [ $bfcl_exit_code -eq 0 ]; then
+        STYLE="success"
+        STATUS="PASSED"
+    else
+        STYLE="error"
+        STATUS="FAILED"
+    fi
+
+    buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
+### BFCL Tool Call Correctness - ${STATUS}
+- **Model:** \`${MODEL}\`
+- **Parser:** \`${TOOL_CALL_PARSER}\`
+- **API type:** \`${API_TYPE}\`
+- **Test category:** \`${TEST_CATEGORY}\`
+EOF
+
+    # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
+    # $BFCL_PROJECT_ROOT/score/
+    RESULTS_ROOT="${OUTPUT_DIR:-.}"
+    if [ -d "$RESULTS_ROOT/result" ]; then
+        buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
+    fi
+    if [ -d "$RESULTS_ROOT/score" ]; then
+        buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
+    fi
+fi
+
+exit $bfcl_exit_code
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -72,7 +72,7 @@ obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"

-# call script to generate indicies for all existing wheels
+# call script to generate indices for all existing wheels
 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
 # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
-# generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+# generate source distribution using setup.py
+python setup.py sdist --dist-dir=$DIST_DIR
 ls -la $DIST_DIR

+SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
+echo "Found sdist: $SDIST_FILE"
+
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  exit 1
 fi

-python3 -m twine check "$PYPI_WHEEL_FILES"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
-echo "Wheels uploaded to PyPI"
+python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
+echo "Wheels and source distribution uploaded to PyPI"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -14,8 +14,3 @@ steps:
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -36,6 +36,16 @@ steps:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

+- label: AsyncTP Correctness Tests (B200)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 - label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
@@ -91,8 +101,8 @@ steps:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"

 - label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
@@ -122,9 +132,9 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"

 - label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
@@ -140,8 +150,8 @@ steps:
  commands:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

 - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
@@ -195,7 +205,7 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -50,23 +50,18 @@ steps:
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Tests (4 GPUs)
-  timeout_in_minutes: 50
+- label: Distributed Torchrun + Examples (4 GPUs)
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
-  - tests/distributed/test_utils
-  - tests/distributed/test_pynccl
-  - tests/distributed/test_events
-  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/distributed/test_torchrun_example.py
+  - tests/distributed/test_torchrun_example_moe.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/distributed
-  - tests/v1/engine/test_engine_core_client.py
-  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
@@ -84,6 +79,27 @@ steps:
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  # OLD rlhf examples
+  - cd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  # NEW rlhf examples
+  - cd new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
+
+- label: Distributed DP Tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_utils
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -91,19 +107,27 @@ steps:
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
+
+- label: Distributed Compile + Comm (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  - tests/distributed/test_multiproc_executor.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
-  # TODO: create a dedicated test section for multi-GPU example tests
-  # when we have multiple distributed example tests
-  # OLD rlhf examples
-  - cd ../examples/offline_inference
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-  # NEW rlhf examples
-  - cd new_weight_syncing
-  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  # test multi-node TP with multiproc executor (simulated on single node)
+  - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
@@ -209,6 +233,19 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

+- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
+  timeout_in_minutes: 30
+  device: a100
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - vllm/v1/worker/kv_connector_model_runner_mixin.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
+
 - label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -1,5 +1,5 @@
 group: Engine
-depends_on: 
+depends_on:
  - image-build
 steps:
 - label: Engine
@@ -14,25 +14,59 @@ steps:
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

- label: V1 e2e + engine
-  timeout_in_minutes: 45
+- label: Engine (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/engine/
+    - tests/v1/engine/
+  commands:
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
+
+- label: e2e Scheduling (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
+  commands:
+    - pytest -v -s v1/e2e/general/test_async_scheduling.py
+
+- label: e2e Core (1 GPU)
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/
+    - tests/v1/e2e/general/
+  commands:
+    - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
+
+- label: V1 e2e (2 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 2
  source_file_dependencies:
    - vllm/
-    - tests/v1
+    - tests/v1/e2e
  commands:
-    # TODO: accuracy does not match, whether setting
-    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - pytest -v -s v1/e2e
-    # Run this test standalone for now;
-    # need to untangle use (implicit) use of spawn/fork across the tests.
-    - pytest -v -s v1/engine/test_preprocess_error_handling.py
-    # Run the rest of v1/engine tests
-    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
+    # Only run tests that need exactly 2 GPUs
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
  mirror:
    amd:
-      device: mi325_1
+      device: mi325_2
+      depends_on:
+      - image-build-amd
+
+- label: V1 e2e (4 GPUs)
+  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
+  optional: true
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/e2e
+  commands:
+    # Only run tests that need 4 GPUs
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
+  mirror:
+    amd:
+      device: mi325_4
      depends_on:
      - image-build-amd
-      commands:
-      - pytest -v -s v1/e2e
-      - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -24,11 +24,6 @@ steps:
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Entrypoints Integration (API Server 1)
  timeout_in_minutes: 130
@@ -39,8 +34,13 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
  - pytest -v -s entrypoints/test_chat_utils.py
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

 - label: Entrypoints Integration (API Server 2)
  timeout_in_minutes: 130
@@ -65,11 +65,6 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd

 - label: Entrypoints Integration (Responses API)
  timeout_in_minutes: 50
@@ -87,6 +82,11 @@ steps:
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

 - label: OpenAI API Correctness
  timeout_in_minutes: 30
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -20,4 +20,19 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
-  - pytest -v -s distributed/test_eplb_spec_decode.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
+
+- label: Elastic EP Scaling Test
+  timeout_in_minutes: 20
+  device: b200
+  optional: true
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/compilation/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_elastic_ep.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -8,8 +8,9 @@ steps:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
+  - tests/kernels/test_concat_mla_q.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py

 - label: Kernels Attention Test %N
  timeout_in_minutes: 35
@@ -44,7 +45,8 @@ steps:
  - vllm/envs.py
  - vllm/config
  commands:
-    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

 - label: Kernels Mamba Test
@@ -70,7 +72,7 @@ steps:
  - tests/kernels/moe/test_batched_deepgemm.py
  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
-    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/quantization/test_block_fp8.py
    - pytest -v -s kernels/moe/test_deepgemm.py
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -95,7 +97,7 @@ steps:
  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -155,5 +157,14 @@ steps:
  commands:
    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
    - pytest -v -s kernels/moe/test_deepep_moe.py
-    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
-    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -11,17 +11,17 @@ steps:
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

- label: LM Eval Large Models (4 GPUs)(A100)
-  device: a100
-  optional: true
-  num_devices: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+# - label: LM Eval Large Models (4 GPUs)(A100)
+#   device: a100
+#   optional: true
+#   num_devices: 4
+#   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+#   source_file_dependencies:
+#   - csrc/
+#   - vllm/model_executor/layers/quantization
+#   commands:
+#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
  device: h100
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -9,6 +9,7 @@ steps:
    - tests/v1
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # split the test to avoid interference
    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
@@ -66,12 +67,13 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic/chat.py # for basic
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/classify.py
-    - python3 offline_inference/basic/embed.py
-    - python3 offline_inference/basic/score.py
+     # for basic
+    - python3 basic/offline_inference/chat.py
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/classify.py
+    - python3 basic/offline_inference/embed.py
+    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -9,9 +9,9 @@ steps:
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -0,0 +1,110 @@
+group: Model Runner V2
+depends_on:
+  - image-build
+steps:
+- label: Model Runner V2 Core Tests
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/v1/worker/gpu/
+  - vllm/v1/worker/gpu_worker.py
+  - vllm/v1/core/sched/
+  - vllm/v1/attention/
+  - tests/v1/engine/test_llm_engine.py
+  - tests/v1/e2e/
+  - tests/v1/entrypoints/llm/test_struct_output_generate.py
+  commands:
+  - set -x
+  - export VLLM_USE_V2_MODEL_RUNNER=1
+  - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
+  # This requires eager until we sort out CG correctness issues.
+  # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
+  - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
+  - pytest -v -s v1/e2e/general/test_context_length.py
+  - pytest -v -s v1/e2e/general/test_min_tokens.py
+  # Temporary hack filter to exclude ngram spec decoding based tests.
+  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
+
+- label: Model Runner V2 Examples
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/core/sched/
+    - vllm/v1/worker/gpu_worker.py
+    - examples/offline_inference/
+    - examples/basic/offline_inference/
+    - examples/pooling/embed/vision_embedding_offline.py
+    - examples/others/tensorize_vllm_model.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pip install tensorizer # for tensorizer test
+    - python3 basic/offline_inference/chat.py # for basic
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
+    #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10  # TODO
+    #- python3 basic/offline_inference/embed.py   # TODO
+    # for multi-modal models
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    # for pooling models
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
+    # for features demo
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+
+- label: Model Runner V2 Distributed (2 GPUs)
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 2
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/basic_correctness/test_basic_correctness.py
+    - tests/v1/distributed/test_async_llm_dp.py
+    - tests/v1/distributed/test_eagle_dp.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
+    - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
+    # https://github.com/NVIDIA/nccl/issues/1838
+    - export NCCL_CUMEM_HOST_ENABLE=0
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
+    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+
+# These require fix https://github.com/vllm-project/vllm/pull/36280
+- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/v1/worker/gpu/
+    - vllm/v1/worker/gpu_worker.py
+    - tests/distributed/test_pipeline_parallel.py
+    #- tests/distributed/test_pp_cudagraph.py
+  commands:
+    - set -x
+    - export VLLM_USE_V2_MODEL_RUNNER=1
+    - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
+    # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
+    #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
+
+- label: Model Runner V2 Spec Decode
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/worker/gpu/
+  - vllm/v1/worker/gpu_worker.py
+  - tests/v1/spec_decode/test_max_len.py
+  - tests/v1/e2e/spec_decode/test_spec_decode.py
+  commands:
+  - set -x
+  - export VLLM_USE_V2_MODEL_RUNNER=1
+  - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
+  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -65,7 +65,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -2,16 +2,65 @@ group: Models - Multimodal
 depends_on: 
  - image-build
 steps:
- label: Multi-Modal Models (Standard) # 60min
-  timeout_in_minutes: 80
+- label: "Multi-Modal Models (Standard) 1: qwen2"
+  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
+    - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
+    - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
+    - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+
+- label: "Multi-Modal Models (Standard) 4: other + whisper"
+  timeout_in_minutes: 45
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

 - label: Multi-Modal Processor Test (CPU)
  depends_on: 
@@ -20,6 +69,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
+  - tests/models/registry.py
  device: cpu
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
@@ -30,6 +80,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
+  - tests/models/registry.py
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -52,6 +103,11 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

 - label: Multi-Modal Models (Extended) 2
  optional: true
@@ -70,12 +126,3 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-
-# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
-  optional: true
-  commands:
-    - echo 'Testing custom models...'
-    # PR authors can temporarily add commands below to test individual models
-    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -15,10 +15,17 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  # begin io_processor plugins test
+  # test generic io_processor plugins functions
+  - pytest -v -s ./plugins_tests/test_io_processor_plugins.py
+  # test Terratorch io_processor plugins
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
+  # test bge_m3_sparse io_processor plugin
+  - pip install -e ./plugins/bge_m3_sparse_plugin
+  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
+  - pip uninstall bge_m3_sparse_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -29,6 +36,6 @@ steps:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/ray_compat.yaml
+++ b/.buildkite/test_areas/ray_compat.yaml
@@ -0,0 +1,16 @@
+group: Ray Compatibility
+depends_on:
+  - image-build
+steps:
+- label: Ray Dependency Compatibility Check
+  # Informational only — does not block the pipeline.
+  # If this fails, it means the PR introduces a dependency that
+  # conflicts with Ray's dependency constraints.
+  # See https://github.com/vllm-project/vllm/issues/33599
+  soft_fail: true
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - requirements/
+  - setup.py
+  commands:
+  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
--- a/.buildkite/test_areas/spec_decode.yaml
+++ b/.buildkite/test_areas/spec_decode.yaml
@@ -0,0 +1,40 @@
+group: Spec Decode
+depends_on:
+  - image-build
+steps:
+- label: Spec Decode Eagle
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
+
+- label: Spec Decode Speculators + MTP
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - vllm/transformers_utils/configs/speculators/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
+
+- label: Spec Decode Ngram + Suffix
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
+
+- label: Spec Decode Draft Model
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/v1/spec_decode/
+    - vllm/v1/worker/gpu/spec_decode/
+    - tests/v1/e2e/spec_decode/
+  commands:
+    - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -13,13 +13,13 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU - Large Models # optional
-  working_dir: "/vllm-workspace/tests"
-  num_devices: 2
-  device: a100
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+# - label: Weight Loading Multiple GPU - Large Models # optional
+#   working_dir: "/vllm-workspace/tests"
+#   num_devices: 2
+#   device: a100
+#   optional: true
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/weight_loading
+#   commands:
+#     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@@ -1,24 +0,0 @@
-# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
-version: 1
-paths:
-# We temporarily disable globally, and will only enable with `annotations.include`
-# include:
-#   - "vllm/v1/attetion/*.py"
-#   - "vllm/v1/core/*.py"
-exclude:
-  - "**/*.py"
-
-scan:
-  functions: true        # check free functions and methods
-  classes: true          # check classes/dataclasses
-  public_only: true      # ignore names starting with "_" at any level
-
-annotations:
-  include:               # decorators that force‑include a symbol
-    - name: "bc_linter_include"  # matched by simple name or dotted suffix
-      propagate_to_members: false # for classes, include methods/inner classes
-  exclude:               # decorators that force‑exclude a symbol
-    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
-      propagate_to_members: true  # for classes, exclude methods/inner classes
-
-excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,7 +2,7 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng
 /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 /vllm/lora @jeejeelee
 /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
@@ -54,11 +54,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/kv_offload @ApostaC @orozery
-/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/engine @njhill
+/vllm/v1/executor @njhill
+/vllm/v1/worker @njhill
 /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche

 # Model runner V2
-/vllm/v1/worker/gpu @WoosukKwon
+/vllm/v1/worker/gpu @WoosukKwon @njhill
+/vllm/v1/worker/gpu/kv_connector.py @orozery

 # Test ownership
 /.buildkite/lm-eval-harness @mgoin 
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -3,6 +3,7 @@ pull_request_rules:
  description: Automatically apply documentation label
  conditions:
    - label != stale
+    - -closed
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@@ -26,7 +27,7 @@ pull_request_rules:
        Hi @{{author}}, the pre-commit checks have failed. Please run:

        ```bash 
-        uv pip install pre-commit
+        uv pip install pre-commit>=4.5.1
        pre-commit install
        pre-commit run --all-files
        ```
@@ -37,15 +38,13 @@ pull_request_rules:

        > [!TIP]
        > <details>
-        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <summary>Is <code>mypy</code> failing?</summary>
        > <br/>
-        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        > <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
        >
        > ```bash
        > # For mypy (substitute "3.10" with the failing version if needed)
        > pre-commit run --hook-stage manual mypy-3.10
-        > # For markdownlint
-        > pre-commit run --hook-stage manual markdownlint
        > ```
        > </details>

@@ -259,8 +258,7 @@ pull_request_rules:
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
-      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+      - files=examples/online_serving/structured_outputs/structured_outputs.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
@@ -336,7 +334,7 @@ pull_request_rules:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+      - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
@@ -383,7 +381,7 @@ pull_request_rules:
    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
-      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+      - files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@@ -1,29 +0,0 @@
-name: BC Lint
-
-on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      - labeled
-      - unlabeled
-
-jobs:
-  bc_lint:
-    if: github.repository_owner == 'vllm-project'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Run BC Lint Action
-        uses: pytorch/test-infra/.github/actions/bc-lint@main
-        with:
-          repo: ${{ github.event.pull_request.head.repo.full_name }}
-          base_sha: ${{ github.event.pull_request.base.sha }}
-          head_sha: ${{ github.event.pull_request.head.sha }}
-          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
-          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
-          config_dir: .github
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -6,6 +6,9 @@ on:
      - main
  workflow_dispatch:  # Manual trigger

+permissions:
+  contents: read
+
 jobs:
  macos-m1-smoke-test:
    runs-on: macos-latest
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@

 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/__init__.py
+!vllm/vllm_flash_attn/flash_attn_interface.py

 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*
@@ -187,11 +189,9 @@ cython_debug/
 .vscode/

 # Claude
-CLAUDE.md
 .claude/

 # Codex
-AGENTS.md
 .codex/

 # Cursor
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.43.5
  hooks:
  - id: typos
    args: [--force-exclude]
@@ -24,12 +24,13 @@ repos:
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
- repo: https://github.com/igorshubovych/markdownlint-cli
-  rev: v0.45.0
+- repo: https://github.com/DavidAnson/markdownlint-cli2
+  rev: v0.21.0
  hooks:
-  - id: markdownlint
-    exclude: '.*\.inc\.md'
-    stages: [manual] # Only run in CI
+  - id: markdownlint-cli2
+    language_version: lts
+    args: [--fix]
+    exclude: ^CLAUDE\.md$
 - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
@@ -55,7 +56,7 @@ repos:
      language: python
      types_or: [python, pyi]
      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+      additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -127,6 +128,13 @@ repos:
    language: python
    types: [python]
    additional_dependencies: [regex]
+  # prevent use torch.cuda APIs
+  - id: check-torch-cuda-call
+    name: "Prevent new 'torch.cuda' APIs call"
+    entry: python tools/pre_commit/check_torch_cuda.py
+    language: python
+    types: [python]
+    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/pre_commit/validate_config.py
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,6 +9,7 @@ build:
    python: "3.12"
  jobs:
    post_checkout:
+      # - bash docs/maybe_skip_pr_build.sh
      - git fetch origin main --unshallow --no-tags --filter=blob:none || true
    pre_create_environment:
      - pip install uv
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,113 @@
+# Agent Instructions for vLLM
+
+> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
+> Breaching these guidelines can result in automatic banning.
+
+## 1. Contribution Policy (Mandatory)
+
+### Duplicate-work checks
+
+Before proposing a PR, run these checks:
+
+```bash
+gh issue view <issue_number> --repo vllm-project/vllm --comments
+gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
+gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
+```
+
+- If an open PR already addresses the same fix, do not open another.
+- If your approach is materially different, explain the difference in the issue.
+
+### No low-value busywork PRs
+
+Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
+
+### Accountability
+
+- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
+- The submitting human must review every changed line and run relevant tests.
+- PR descriptions for AI-assisted work **must** include:
+    - Why this is not duplicating an existing PR.
+    - Test commands run and results.
+    - Clear statement that AI assistance was used.
+
+### Fail-closed behavior
+
+If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
+
+---
+
+## 2. Development Workflow
+
+### Environment setup
+
+```bash
+# Install `uv` if you don't have it already:
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Always use `uv` for Python environment management:
+uv venv --python 3.12
+source .venv/bin/activate
+
+# Always make sure `pre-commit` and its hooks are installed:
+uv pip install -r requirements/lint.txt
+pre-commit install
+```
+
+### Installing dependencies
+
+```bash
+# If you are only making Python changes:
+VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# If you are also making C/C++ changes:
+uv pip install -e .
+```
+
+### Running tests
+
+Tests require extra dependencies.
+All versions for test dependencies should be read from `requirements/test.txt`
+
+```bash
+# Install bare minimum test dependencies:
+uv pip install pytest pytest-asyncio tblib
+
+# Install additional test dependencies as needed, or install them all as follows:
+uv pip install -r requirements/test.txt
+
+# Run specific test from specific test file
+pytest tests/path/to/test.py -v -s -k test_name
+
+# Run all tests in directory
+pytest tests/path/to/dir -v -s
+```
+
+### Running linters
+
+```bash
+# Run all pre-commit hooks on staged files:
+pre-commit run
+
+# Run on all files:
+pre-commit run --all-files
+
+# Run a specific hook:
+pre-commit run ruff-check --all-files
+
+# Run mypy as it is in CI:
+pre-commit run mypy-3.10 --all-files --hook-stage manual
+```
+
+### Commit messages
+
+Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
+
+```text
+Your commit message here
+
+Co-authored-by: GitHub Copilot
+Co-authored-by: Claude
+Co-authored-by: gemini-code-assist
+Signed-off-by: Your Name <your.email@example.com>
+```
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+@AGENTS.md
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")

 # ROCm installation prefix. Default to /opt/rocm but allow override via
 # -DROCM_PATH=/your/rocm/path when invoking cmake.
@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # CUTLASS MoE kernels

  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
-  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
+  # on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
@@ -771,6 +771,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

+  # Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
+    set(SRCS
+      "csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
+      "csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
+    message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
+        AND ES_MXFP8_GROUPED_MM_ARCHS)
+      message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
+                     "not >= 12.8.")
+    else()
+      message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
+                     "in CUDA target architectures.")
+    endif()
+  endif()
+
  # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
@@ -971,7 +998,8 @@ set(VLLM_MOE_EXT_SRC
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/moe_wna16.cu"
-    "csrc/moe/grouped_topk_kernels.cu")
+    "csrc/moe/grouped_topk_kernels.cu"
+    "csrc/moe/router_gemm.cu")
 endif()

 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/benchmarks/attention_benchmarks/README.md
+++ b/benchmarks/attention_benchmarks/README.md
@@ -187,7 +187,7 @@ python benchmark.py \
 ## Hardware Requirements

 | Backend | Hardware |
-|---------|----------|
+| ------- | -------- |
 | Flash/Triton/FlashInfer | Any CUDA GPU |
 | CUTLASS MLA | Blackwell (SM100+) |
 | FlashAttn MLA | Hopper (SM90+) |
--- a/benchmarks/attention_benchmarks/init.py
+++ b/benchmarks/attention_benchmarks/init.py
@@ -15,7 +15,6 @@ from .common import (
    BenchmarkConfig,
    BenchmarkResult,
    MockLayer,
-    MockModelConfig,
    ResultsFormatter,
    get_attention_scale,
    is_mla_backend,
@@ -36,7 +35,6 @@ __all__ = [
    "ResultsFormatter",
    # Mock objects
    "MockLayer",
-    "MockModelConfig",
    # Utilities
    "setup_mla_dims",
    "get_attention_scale",
--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
@@ -47,6 +47,8 @@ from common import (
    is_mla_backend,
 )

+from vllm.v1.worker.workspace import init_workspace_manager
+

 def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -59,7 +61,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
    """Run MLA benchmark with appropriate backend."""
    from mla_runner import run_mla_benchmark as run_mla

-    return run_mla(config.backend, config, **kwargs)
+    return run_mla(
+        config.backend, config, prefill_backend=config.prefill_backend, **kwargs
+    )


 def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -440,20 +444,27 @@ def main():
    # Backend selection
    parser.add_argument(
        "--backends",
+        "--decode-backends",
        nargs="+",
-        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
+        help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
        "flashinfer_mla, flashattn_mla, flashmla)",
    )
    parser.add_argument(
        "--backend",
        help="Single backend (alternative to --backends)",
    )
+    parser.add_argument(
+        "--prefill-backends",
+        nargs="+",
+        help="Prefill backends to compare (fa2, fa3, fa4). "
+        "Uses the first decode backend for impl construction.",
+    )

    # Batch specifications
    parser.add_argument(
        "--batch-specs",
        nargs="+",
-        default=["q2k", "8q1s1k"],
+        default=None,
        help="Batch specifications using extended grammar",
    )

@@ -469,6 +480,21 @@ def main():
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        default="auto",
+        choices=["auto", "fp8"],
+        help="KV cache dtype: auto or fp8",
+    )
+    parser.add_argument(
+        "--cuda-graphs",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Launch kernels with CUDA graphs to eliminate CPU overhead"
+            "in measurements (default: True)"
+        ),
+    )

    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
@@ -502,7 +528,7 @@ def main():

        # Override args with YAML values, but CLI args take precedence
        # Check if CLI provided backends (they would be non-None and not default)
-        cli_backends_provided = args.backends is not None or args.backend is not None
+        cli_backends_provided = args.backend is not None or args.backends is not None

        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
@@ -512,6 +538,12 @@ def main():
            elif "backends" in yaml_config:
                args.backends = yaml_config["backends"]
                args.backend = None
+            elif "decode_backends" in yaml_config:
+                args.backends = yaml_config["decode_backends"]
+                args.backend = None
+
+        # Prefill backends (e.g., ["fa3", "fa4"])
+        args.prefill_backends = yaml_config.get("prefill_backends", None)

        # Check for special modes
        if "mode" in yaml_config:
@@ -521,21 +553,24 @@ def main():

        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
-        if "batch_spec_ranges" in yaml_config:
-            # Generate batch specs from ranges
-            generated_specs = generate_batch_specs_from_ranges(
-                yaml_config["batch_spec_ranges"]
-            )
-            # Combine with any explicit batch_specs
-            if "batch_specs" in yaml_config:
-                args.batch_specs = yaml_config["batch_specs"] + generated_specs
-            else:
-                args.batch_specs = generated_specs
-            console.print(
-                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
-            )
-        elif "batch_specs" in yaml_config:
-            args.batch_specs = yaml_config["batch_specs"]
+        # CLI --batch-specs takes precedence over YAML when provided.
+        cli_batch_specs_provided = args.batch_specs is not None
+        if not cli_batch_specs_provided:
+            if "batch_spec_ranges" in yaml_config:
+                # Generate batch specs from ranges
+                generated_specs = generate_batch_specs_from_ranges(
+                    yaml_config["batch_spec_ranges"]
+                )
+                # Combine with any explicit batch_specs
+                if "batch_specs" in yaml_config:
+                    args.batch_specs = yaml_config["batch_specs"] + generated_specs
+                else:
+                    args.batch_specs = generated_specs
+                console.print(
+                    f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
+                )
+            elif "batch_specs" in yaml_config:
+                args.batch_specs = yaml_config["batch_specs"]

        if "batch_sizes" in yaml_config:
            args.batch_sizes = yaml_config["batch_sizes"]
@@ -560,6 +595,10 @@ def main():
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]
+        if "kv_cache_dtype" in yaml_config:
+            args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
+        if "cuda_graphs" in yaml_config:
+            args.cuda_graphs = yaml_config["cuda_graphs"]

        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
@@ -613,10 +652,19 @@ def main():

    # Determine backends
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
+    prefill_backends = getattr(args, "prefill_backends", None)
+    if not args.batch_specs:
+        args.batch_specs = ["q2k", "8q1s1k"]
    console.print(f"Backends: {', '.join(backends)}")
+    if prefill_backends:
+        console.print(f"Prefill backends: {', '.join(prefill_backends)}")
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
+    console.print(f"KV cache dtype: {args.kv_cache_dtype}")
+    console.print(f"CUDA graphs: {args.cuda_graphs}")
    console.print()

+    init_workspace_manager(args.device)
+
    # Run benchmarks
    all_results = []

@@ -669,6 +717,8 @@ def main():
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
+                        kv_cache_dtype=args.kv_cache_dtype,
+                        use_cuda_graphs=args.cuda_graphs,
                    )

                    # Add decode pipeline config
@@ -821,6 +871,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_model_parameter_sweep(
            backends,
@@ -843,6 +895,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -850,37 +904,95 @@ def main():

    else:
        # Normal mode: compare backends
-        total = len(backends) * len(args.batch_specs)
+        decode_results = []
+        prefill_results = []

-        with tqdm(total=total, desc="Benchmarking") as pbar:
-            for spec in args.batch_specs:
-                for backend in backends:
-                    config = BenchmarkConfig(
-                        backend=backend,
-                        batch_spec=spec,
-                        num_layers=args.num_layers,
-                        head_dim=args.head_dim,
-                        num_q_heads=args.num_q_heads,
-                        num_kv_heads=args.num_kv_heads,
-                        block_size=args.block_size,
-                        device=args.device,
-                        repeats=args.repeats,
-                        warmup_iters=args.warmup_iters,
-                        profile_memory=args.profile_memory,
-                    )
+        # Run decode backend comparison
+        if not prefill_backends:
+            # No prefill backends specified: compare decode backends as before
+            total = len(backends) * len(args.batch_specs)

-                    result = run_benchmark(config)
-                    all_results.append(result)
+            with tqdm(total=total, desc="Benchmarking") as pbar:
+                for spec in args.batch_specs:
+                    for backend in backends:
+                        config = BenchmarkConfig(
+                            backend=backend,
+                            batch_spec=spec,
+                            num_layers=args.num_layers,
+                            head_dim=args.head_dim,
+                            num_q_heads=args.num_q_heads,
+                            num_kv_heads=args.num_kv_heads,
+                            block_size=args.block_size,
+                            device=args.device,
+                            repeats=args.repeats,
+                            warmup_iters=args.warmup_iters,
+                            profile_memory=args.profile_memory,
+                            kv_cache_dtype=args.kv_cache_dtype,
+                            use_cuda_graphs=args.cuda_graphs,
+                        )

-                    if not result.success:
-                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
+                        result = run_benchmark(config)
+                        decode_results.append(result)

-                    pbar.update(1)
+                        if not result.success:
+                            console.print(
+                                f"[red]Error {backend} {spec}: {result.error}[/]"
+                            )

-        # Display results
-        console.print("\n[bold green]Results:[/]")
-        formatter = ResultsFormatter(console)
-        formatter.print_table(all_results, backends)
+                        pbar.update(1)
+
+            console.print("\n[bold green]Results:[/]")
+            formatter = ResultsFormatter(console)
+            formatter.print_table(decode_results, backends)
+
+        # Run prefill backend comparison
+        if prefill_backends:
+            # Use first decode backend for impl construction
+            decode_backend = backends[0]
+            total = len(prefill_backends) * len(args.batch_specs)
+
+            console.print(
+                f"[yellow]Prefill comparison mode: "
+                f"using {decode_backend} for decode impl[/]"
+            )
+
+            with tqdm(total=total, desc="Prefill benchmarking") as pbar:
+                for spec in args.batch_specs:
+                    for pb in prefill_backends:
+                        config = BenchmarkConfig(
+                            backend=decode_backend,
+                            batch_spec=spec,
+                            num_layers=args.num_layers,
+                            head_dim=args.head_dim,
+                            num_q_heads=args.num_q_heads,
+                            num_kv_heads=args.num_kv_heads,
+                            block_size=args.block_size,
+                            device=args.device,
+                            repeats=args.repeats,
+                            warmup_iters=args.warmup_iters,
+                            profile_memory=args.profile_memory,
+                            prefill_backend=pb,
+                        )
+
+                        result = run_benchmark(config)
+
+                        # Label result with prefill backend name for display
+                        labeled_config = replace(result.config, backend=pb)
+                        result = replace(result, config=labeled_config)
+                        prefill_results.append(result)
+
+                        if not result.success:
+                            console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
+
+                        pbar.update(1)
+
+            console.print("\n[bold green]Prefill Backend Results:[/]")
+            formatter = ResultsFormatter(console)
+            formatter.print_table(
+                prefill_results, prefill_backends, compare_to_fastest=True
+            )
+
+        all_results = decode_results + prefill_results

    # Save results
    if all_results:
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -10,7 +10,6 @@ from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any

-import numpy as np
 import torch
 from batch_spec import get_batch_type, parse_batch_spec
 from rich.console import Console
@@ -31,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
        max_kv_len = max(r.kv_len for r in requests) if requests else 0
        return (batch_size, max_q_len, max_kv_len)
    except Exception:
-        # Fallback for unparseable specs
+        # Fallback for unparsable specs
        return (0, 0, 0)


@@ -62,10 +61,7 @@ class MockHfConfig:
 # Import AttentionLayerBase at module level to avoid circular dependencies
 try:
    from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-
-    _HAS_ATTENTION_LAYER_BASE = True
 except ImportError:
-    _HAS_ATTENTION_LAYER_BASE = False
    AttentionLayerBase = object  # Fallback


@@ -81,6 +77,7 @@ class MockKVBProj:
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim
        self.out_dim = qk_nope_head_dim + v_head_dim
+        self.weight = torch.empty(0, dtype=torch.bfloat16)

    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
        """
@@ -167,95 +164,6 @@ class MockLayer(AttentionLayerBase):
        return self._kv_cache_spec


-class MockModelConfig:
-    """Mock model configuration."""
-
-    def __init__(
-        self,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype = torch.float16,
-        max_model_len: int = 32768,
-    ):
-        self._n_q = num_q_heads
-        self._n_kv = num_kv_heads
-        self._d = head_dim
-        self.dtype = dtype
-        self.max_model_len = max_model_len
-
-    def get_num_attention_heads(self, _=None) -> int:
-        return self._n_q
-
-    def get_num_kv_heads(self, _=None) -> int:
-        return self._n_kv
-
-    def get_head_size(self) -> int:
-        return self._d
-
-    def get_num_layers(self) -> int:
-        """Mock method for layer count queries."""
-        return 1
-
-    def get_sliding_window_for_layer(self, _layer_idx: int):
-        """Mock method for sliding window queries."""
-        return None
-
-    def get_logits_soft_cap_for_layer(self, _layer_idx: int):
-        """Mock method for logits soft cap queries."""
-        return None
-
-    def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
-        """Mock method for SM scale queries."""
-        return 1.0 / (self.get_head_size() ** 0.5)
-
-
-class MockParallelConfig:
-    """Mock parallel configuration."""
-
-    pass
-
-
-class MockCompilationConfig:
-    """Mock compilation configuration."""
-
-    def __init__(self):
-        self.full_cuda_graph = False
-        self.static_forward_context = {}
-
-
-class MockVLLMConfig:
-    """Mock VLLM configuration."""
-
-    def __init__(self):
-        self.compilation_config = MockCompilationConfig()
-
-
-class MockRunner:
-    """Mock GPU runner for metadata builders."""
-
-    def __init__(
-        self,
-        seq_lens: np.ndarray,
-        query_start_locs: np.ndarray,
-        device: torch.device,
-        num_q_heads: int,
-        num_kv_heads: int,
-        head_dim: int,
-        dtype: torch.dtype,
-    ):
-        self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
-        self.parallel_config = MockParallelConfig()
-        self.vllm_config = MockVLLMConfig()
-        self.seq_lens_np = seq_lens
-        self.query_start_loc_np = query_start_locs
-        self.device = device
-        self.attention_chunk_size = None
-        self.num_query_heads = num_q_heads
-        self.num_kv_heads = num_kv_heads
-        self.dtype = dtype
-
-
@dataclass
 class ParameterSweep:
    """Configuration for sweeping a backend parameter."""
@@ -305,7 +213,11 @@ class BenchmarkConfig:
    profile_memory: bool = False
    use_cuda_graphs: bool = False

+    # "auto" or "fp8"
+    kv_cache_dtype: str = "auto"
+
    # MLA-specific
+    prefill_backend: str | None = None
    kv_lora_rank: int | None = None
    qk_nope_head_dim: int | None = None
    qk_rope_head_dim: int | None = None
@@ -460,6 +372,7 @@ class ResultsFormatter:
                    "backend",
                    "batch_spec",
                    "num_layers",
+                    "kv_cache_dtype",
                    "mean_time",
                    "std_time",
                    "throughput",
@@ -473,6 +386,7 @@ class ResultsFormatter:
                        "backend": r.config.backend,
                        "batch_spec": r.config.batch_spec,
                        "num_layers": r.config.num_layers,
+                        "kv_cache_dtype": r.config.kv_cache_dtype,
                        "mean_time": r.mean_time,
                        "std_time": r.std_time,
                        "throughput": r.throughput_tokens_per_sec or 0,
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -30,9 +30,9 @@ batch_specs:
  - "2q16k_32q1s4k"         # 2 very large prefill + 32 decode

  # Context extension + decode
-  - "2q1kkv2k_16q1s1k"       # 2 extend + 16 decode
-  - "4q2kkv4k_32q1s2k"       # 4 extend + 32 decode
-  - "2q1kkv8k_32q1s2k"       # 2 large extend + 32 decode
+  - "2q1ks2k_16q1s1k"       # 2 extend + 16 decode
+  - "4q2ks4k_32q1s2k"       # 4 extend + 32 decode
+  - "2q1ks8k_32q1s2k"       # 2 large extend + 32 decode

  # Explicitly chunked prefill
  - "q8k"           # 8k prefill with chunking hint
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -1,4 +1,19 @@
-# MLA prefill-only benchmark configuration for sparse backends
+# MLA prefill backend comparison
+#
+# Compares all available MLA prefill backends:
+#   FA backends:  fa2, fa3, fa4 (FlashAttention versions)
+#   Non-FA:       flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
+#
+# Uses cutlass_mla as the decode backend for impl construction
+# (only the prefill path is exercised).
+#
+# Backends that aren't available on the current platform will report errors
+# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
+#
+# Usage:
+#   python benchmark.py --config configs/mla_prefill.yaml
+
+description: "MLA prefill backend comparison"

 model:
  name: "deepseek-v3"
@@ -12,20 +27,25 @@ model:
  v_head_dim: 128
  block_size: 128

-# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
-# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
-model_parameter_sweep:
-  param_name: "num_q_heads"
-  values: [128, 64, 32, 16]
-  label_format: "{backend}_{value}h"
+# model:
+#   name: "deepseek-v2-lite"
+#   num_layers: 27
+#   num_q_heads: 16
+#   num_kv_heads: 1
+#   head_dim: 576
+#   kv_lora_rank: 512
+#   qk_nope_head_dim: 128
+#   qk_rope_head_dim: 64
+#   v_head_dim: 128
+#   block_size: 128

 batch_specs:
  # Pure prefill
-  - "1q512"
-  - "1q1k"
-  - "1q2k"
-  - "1q4k"
-  - "1q8k"
+  - "q512"
+  - "q1k"
+  - "q2k"
+  - "q4k"
+  - "q8k"

  # Batched pure prefill
  - "2q512"
@@ -44,19 +64,63 @@ batch_specs:
  - "8q4k"
  - "8q8k"

-  # Extend
-  - "1q512s4k"
-  - "1q512s8k"
-  - "1q1ks8k"
-  - "1q2ks8k"
-  - "1q2ks16k"
-  - "1q4ks16k"
+  # Chunked prefill / extend
+  # Short context
+  - "q128s1k"
+  - "q256s2k"
+  - "q512s4k"
+  - "q1ks4k"
+  - "q2ks8k"
+  - "2q128s1k"
+  - "2q256s2k"
+  - "2q512s4k"
+  - "2q1ks4k"
+  - "2q2ks8k"
+  - "4q128s1k"
+  - "4q256s2k"
+  - "4q512s4k"
+  - "4q1ks4k"
+  - "4q2ks8k"
+  - "8q128s1k"
+  - "8q256s2k"
+  - "8q512s4k"
+  - "8q1ks4k"

-backends:
-  - FLASHMLA_SPARSE
-  - FLASHINFER_MLA_SPARSE
+  # Medium context
+  - "q128s16k"
+  - "q512s16k"
+  - "q1ks16k"
+  - "q2ks16k"
+  - "2q128s16k"
+  - "2q512s16k"
+  - "2q1ks16k"
+  - "2q2ks16k"
+  - "4q128s16k"
+  - "4q512s16k"
+  - "4q1ks16k"
+  - "4q2ks16k"
+
+  # Long context
+  - "q128s64k"
+  - "q512s64k"
+  - "q1ks64k"
+  - "q2ks64k"
+  - "2q128s64k"
+  - "2q512s64k"
+  - "2q1ks64k"
+  - "2q2ks64k"
+
+decode_backends:
+  - CUTLASS_MLA
+
+prefill_backends:
+  - fa2
+  - fa3
+  - fa4
+  - flashinfer
+  - cudnn
+  - trtllm

 device: "cuda:0"
-repeats: 10
-warmup_iters: 3
-profile_memory: true
+repeats: 20
+warmup_iters: 5
--- a/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
@@ -0,0 +1,58 @@
+# MLA decode-only benchmark configuration
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128  # Base value, can be swept for TP simulation
+  num_kv_heads: 1  # MLA uses single latent KV
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Small batches, varying sequence lengths
+  - "16q1s512"     # 16 requests, 512 KV cache
+  - "16q1s1k"      # 16 requests, 1k KV cache
+  - "16q1s2k"      # 16 requests, 2k KV cache
+  - "16q1s4k"      # 16 requests, 4k KV cache
+
+  # Medium batches
+  - "32q1s1k"      # 32 requests, 1k KV cache
+  - "32q1s2k"      # 32 requests, 2k KV cache
+  - "32q1s4k"      # 32 requests, 4k KV cache
+  - "32q1s8k"      # 32 requests, 8k KV cache
+
+  # Large batches
+  - "64q1s1k"      # 64 requests, 1k KV cache
+  - "64q1s2k"      # 64 requests, 2k KV cache
+  - "64q1s4k"      # 64 requests, 4k KV cache
+  - "64q1s8k"      # 64 requests, 8k KV cache
+
+  # Very large batches
+  - "128q1s1k"     # 128 requests, 1k KV cache
+  - "128q1s2k"     # 128 requests, 2k KV cache
+  - "128q1s4k"     # 128 requests, 4k KV cache
+  - "128q1s8k"     # 128 requests, 8k KV cache
+
+  # Long context
+  - "32q1s16k"     # 32 requests, 16k KV cache
+  - "32q1s32k"     # 32 requests, 32k KV cache
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 100
+warmup_iters: 10
+profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml
@@ -0,0 +1,62 @@
+# MLA prefill-only benchmark configuration for sparse backends
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Pure prefill
+  - "1q512"
+  - "1q1k"
+  - "1q2k"
+  - "1q4k"
+  - "1q8k"
+
+  # Batched pure prefill
+  - "2q512"
+  - "2q1k"
+  - "2q2k"
+  - "2q4k"
+  - "2q8k"
+  - "4q512"
+  - "4q1k"
+  - "4q2k"
+  - "4q4k"
+  - "4q8k"
+  - "8q512"
+  - "8q1k"
+  - "8q2k"
+  - "8q4k"
+  - "8q8k"
+
+  # Extend
+  - "1q512s4k"
+  - "1q512s8k"
+  - "1q1ks8k"
+  - "1q2ks8k"
+  - "1q2ks16k"
+  - "1q4ks16k"
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 10
+warmup_iters: 3
+profile_memory: true
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -60,8 +60,11 @@ def create_minimal_vllm_config(
    model_name: str = "deepseek-v3",
    block_size: int = 128,
    max_num_seqs: int = 256,
+    max_num_batched_tokens: int = 8192,
    mla_dims: dict | None = None,
    index_topk: int | None = None,
+    prefill_backend: str | None = None,
+    kv_cache_dtype: str = "auto",
 ) -> VllmConfig:
    """
    Create minimal VllmConfig for MLA benchmarks.
@@ -75,6 +78,9 @@ def create_minimal_vllm_config(
                  setup_mla_dims(model_name)
        index_topk: Optional topk value for sparse MLA backends. If provided,
                    the config will include index_topk for sparse attention.
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
+                        "cudnn", "trtllm"). Configures the attention config to
+                        force the specified prefill backend.

    Returns:
        VllmConfig for benchmarking
@@ -145,14 +151,13 @@ def create_minimal_vllm_config(
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
-        swap_space=0,
-        cache_dtype="auto",
+        cache_dtype=kv_cache_dtype,
        enable_prefix_caching=False,
    )

    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=8192,
+        max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
        max_model_len=32768,
        is_encoder_decoder=False,
        enable_chunked_prefill=True,
@@ -164,7 +169,7 @@ def create_minimal_vllm_config(

    compilation_config = CompilationConfig()

-    return VllmConfig(
+    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
@@ -172,9 +177,84 @@ def create_minimal_vllm_config(
        compilation_config=compilation_config,
    )

+    if prefill_backend is not None:
+        prefill_cfg = get_prefill_backend_config(prefill_backend)
+        if prefill_cfg["flash_attn_version"] is not None:
+            vllm_config.attention_config.flash_attn_version = prefill_cfg[
+                "flash_attn_version"
+            ]
+        vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
+            "disable_flashinfer_prefill"
+        ]
+        vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
+            "use_cudnn_prefill"
+        ]
+        vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
+            "use_trtllm_ragged_deepseek_prefill"
+        ]
+
+    return vllm_config
+

 # ============================================================================
-# Backend Configuration
+# Prefill Backend Configuration
+# ============================================================================
+
+# Maps prefill backend names to attention config overrides.
+# FA backends set flash_attn_version and disable non-FA paths.
+# Non-FA backends enable their specific path and disable others.
+_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
+    "fa2": {
+        "flash_attn_version": 2,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "fa3": {
+        "flash_attn_version": 3,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "fa4": {
+        "flash_attn_version": 4,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "flashinfer": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": False,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "cudnn": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": True,
+        "use_trtllm_ragged_deepseek_prefill": False,
+    },
+    "trtllm": {
+        "flash_attn_version": None,
+        "disable_flashinfer_prefill": True,
+        "use_cudnn_prefill": False,
+        "use_trtllm_ragged_deepseek_prefill": True,
+    },
+}
+
+
+def get_prefill_backend_config(prefill_backend: str) -> dict:
+    """Get attention config overrides for a prefill backend."""
+    if prefill_backend not in _PREFILL_BACKEND_CONFIG:
+        raise ValueError(
+            f"Unknown prefill backend: {prefill_backend!r}. "
+            f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
+        )
+    return _PREFILL_BACKEND_CONFIG[prefill_backend]
+
+
+# ============================================================================
+# Decode Backend Configuration
 # ============================================================================


@@ -204,6 +284,7 @@ def _get_backend_config(backend: str) -> dict:
    Returns:
        Dict with backend configuration
    """
+    from vllm.v1.attention.backend import MultipleOf
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    try:
@@ -220,8 +301,8 @@ def _get_backend_config(backend: str) -> dict:
    block_sizes = backend_class.get_supported_kernel_block_sizes()
    # Use first supported block size (backends typically support one for MLA)
    block_size = block_sizes[0] if block_sizes else None
-    if hasattr(block_size, "value"):
-        # Handle MultipleOf enum
+    if isinstance(block_size, MultipleOf):
+        # No fixed block size; fall back to config value
        block_size = None

    # Check if sparse via class method if available
@@ -456,6 +537,7 @@ def _create_backend_impl(
    device: torch.device,
    max_num_tokens: int = 8192,
    index_topk: int | None = None,
+    kv_cache_dtype: str = "auto",
 ):
    """
    Create backend implementation instance.
@@ -504,7 +586,7 @@ def _create_backend_impl(
        "num_kv_heads": mla_dims["num_kv_heads"],
        "alibi_slopes": None,
        "sliding_window": None,
-        "kv_cache_dtype": "auto",
+        "kv_cache_dtype": kv_cache_dtype,
        "logits_soft_cap": None,
        "attn_type": "decoder",
        "kv_sharing_target_layer_name": None,
@@ -622,6 +704,7 @@ def _run_single_benchmark(
    mla_dims: dict,
    device: torch.device,
    indexer=None,
+    kv_cache_dtype: str | None = None,
 ) -> BenchmarkResult:
    """
    Run a single benchmark iteration.
@@ -655,53 +738,123 @@ def _run_single_benchmark(
    )

    # Create KV cache
-    kv_cache = torch.zeros(
-        num_blocks,
-        block_size,
-        mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
-        device=device,
-        dtype=torch.bfloat16,
-    )
+    if kv_cache_dtype is None:
+        kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
+    head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
+    if kv_cache_dtype == "fp8_ds_mla":
+        # FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
+        # Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
+        #         + 2*rope_dim bf16 bytes
+        # = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            656,
+            device=device,
+            dtype=torch.uint8,
+        )
+    elif kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform

-    # Create input tensors for both decode and prefill modes
-    decode_inputs, prefill_inputs = _create_input_tensors(
-        total_q,
-        mla_dims,
-        backend_cfg["query_format"],
-        device,
-        torch.bfloat16,
-    )
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.uint8,
+        ).view(current_platform.fp8_dtype())
+    else:
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.bfloat16,
+        )

    # Fill indexer with random indices for sparse backends
    is_sparse = backend_cfg.get("is_sparse", False)
    if is_sparse and indexer is not None:
        indexer.fill_random_indices(total_q, max_kv_len)

-    # Determine which forward method to use
-    if is_sparse:
-        # Sparse backends use forward_mqa
-        forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
-    elif metadata.decode is not None:
-        forward_fn = lambda: impl._forward_decode(
-            decode_inputs, kv_cache, metadata, layer
-        )
-    elif metadata.prefill is not None:
-        forward_fn = lambda: impl._forward_prefill(
-            prefill_inputs["q"],
-            prefill_inputs["k_c_normed"],
-            prefill_inputs["k_pe"],
-            kv_cache,
-            metadata,
-            prefill_inputs["k_scale"],
-            prefill_inputs["output"],
-        )
-    else:
+    # Determine which forward methods to use based on metadata.
+    # Sparse MLA backends always use forward_mqa
+    has_decode = is_sparse or getattr(metadata, "decode", None) is not None
+    has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
+    if not has_decode and not has_prefill:
        raise RuntimeError("Metadata has neither decode nor prefill metadata")

+    num_decode = (
+        metadata.num_decode_tokens
+        if (has_decode and has_prefill)
+        else total_q
+        if has_decode
+        else 0
+    )
+    num_prefill = total_q - num_decode
+
+    # Some backends requires fp8 queries when using fp8 KV cache.
+    is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
+    quantize_query = is_fp8_kvcache and getattr(
+        impl, "supports_quant_query_input", False
+    )
+
+    # quantize_query forces concat format
+    query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
+
+    # Create decode query tensors
+    if has_decode:
+        decode_inputs, _ = _create_input_tensors(
+            num_decode, mla_dims, query_fmt, device, torch.bfloat16
+        )
+        # Cast decode query to fp8 if the backend supports it
+        if quantize_query:
+            from vllm.platforms import current_platform
+
+            if isinstance(decode_inputs, tuple):
+                decode_inputs = torch.cat(list(decode_inputs), dim=-1)
+            decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
+
+    # Create prefill input tensors
+    if has_prefill:
+        _, prefill_inputs = _create_input_tensors(
+            num_prefill, mla_dims, query_fmt, device, torch.bfloat16
+        )
+
+    # Build forward function
+    def forward_fn():
+        results = []
+        if has_decode:
+            results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
+        if has_prefill:
+            results.append(
+                impl.forward_mha(
+                    prefill_inputs["q"],
+                    prefill_inputs["k_c_normed"],
+                    prefill_inputs["k_pe"],
+                    kv_cache,
+                    metadata,
+                    prefill_inputs["k_scale"],
+                    prefill_inputs["output"],
+                )
+            )
+        return results[0] if len(results) == 1 else tuple(results)
+
    # Warmup
    for _ in range(config.warmup_iters):
        forward_fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
+
+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            forward_fn()
+        benchmark_fn = graph.replay
+    else:
+        benchmark_fn = forward_fn

    # Benchmark
    times = []
@@ -711,10 +864,10 @@ def _run_single_benchmark(

        start.record()
        for _ in range(config.num_layers):
-            forward_fn()
+            benchmark_fn()
        end.record()

-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        elapsed_ms = start.elapsed_time(end)
        times.append(elapsed_ms / 1000.0 / config.num_layers)

@@ -733,6 +886,7 @@ def _run_mla_benchmark_batched(
    backend: str,
    configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
    index_topk: int = 2048,
+    prefill_backend: str | None = None,
 ) -> list[BenchmarkResult]:
    """
    Unified batched MLA benchmark runner for all backends.
@@ -744,11 +898,13 @@ def _run_mla_benchmark_batched(
    to avoid setup/teardown overhead.

    Args:
-        backend: Backend name
+        backend: Backend name (decode backend used for impl construction)
        configs_with_params: List of (config, threshold, num_splits) tuples
            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
            - num_splits: num_kv_splits (CUTLASS only)
        index_topk: Topk value for sparse MLA backends (default 2048)
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
+            When set, forces the specified FlashAttention version for prefill.

    Returns:
        List of BenchmarkResult objects
@@ -758,7 +914,7 @@ def _run_mla_benchmark_batched(

    backend_cfg = _get_backend_config(backend)
    device = torch.device(configs_with_params[0][0].device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)

    # Determine block size
    config_block_size = configs_with_params[0][0].block_size
@@ -775,26 +931,91 @@ def _run_mla_benchmark_batched(
    # Determine if this is a sparse backend
    is_sparse = backend_cfg.get("is_sparse", False)

+    # Extract kv_cache_dtype from the first config
+    kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
+
+    # FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
+    # Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
+    if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
+        kv_cache_dtype = "fp8_ds_mla"
+
+    # Compute max total_q across all configs so the metadata builder buffer
+    # and scheduler config are large enough for all batch specs.
+    max_total_q = max(
+        sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
+        for cfg, *_ in configs_with_params
+    )
+
    # Create and set vLLM config for MLA (reused across all benchmarks)
    vllm_config = create_minimal_vllm_config(
        model_name="deepseek-v3",  # Used only for model path
        block_size=block_size,
+        max_num_batched_tokens=max_total_q,
        mla_dims=mla_dims,  # Use custom dims from config or default
        index_topk=index_topk if is_sparse else None,
+        prefill_backend=prefill_backend,
+        kv_cache_dtype=kv_cache_dtype,
    )

    results = []

    with set_current_vllm_config(vllm_config):
+        # Clear cached prefill backend detection functions so they re-evaluate
+        # with the current VllmConfig. These are @functools.cache decorated and
+        # would otherwise return stale results from a previous backend's config.
+        from vllm.model_executor.layers.attention.mla_attention import (
+            use_cudnn_prefill,
+            use_flashinfer_prefill,
+            use_trtllm_ragged_deepseek_prefill,
+        )
+
+        use_flashinfer_prefill.cache_clear()
+        use_cudnn_prefill.cache_clear()
+        use_trtllm_ragged_deepseek_prefill.cache_clear()
+
        # Create backend impl, layer, builder, and indexer (reused across benchmarks)
        impl, layer, builder_instance, indexer = _create_backend_impl(
            backend_cfg,
            mla_dims,
            vllm_config,
            device,
+            max_num_tokens=max_total_q,
            index_topk=index_topk if is_sparse else None,
+            kv_cache_dtype=kv_cache_dtype,
        )

+        # Verify the actual prefill backend matches what was requested
+        if prefill_backend is not None:
+            prefill_cfg = get_prefill_backend_config(prefill_backend)
+            fa_version = prefill_cfg["flash_attn_version"]
+
+            if fa_version is not None:
+                # FA backend: verify the impl's FA version
+                actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
+                if actual_fa_version != fa_version:
+                    raise RuntimeError(
+                        f"Prefill backend '{prefill_backend}' requested FA "
+                        f"version {fa_version}, but the impl is using FA "
+                        f"version {actual_fa_version}. Check "
+                        f"vllm/v1/attention/backends/fa_utils.py."
+                    )
+            else:
+                # Non-FA backend: verify the builder picked the right path
+                expected_flags = {
+                    "flashinfer": "_use_fi_prefill",
+                    "cudnn": "_use_cudnn_prefill",
+                    "trtllm": "_use_trtllm_ragged_prefill",
+                }
+                flag_name = expected_flags.get(prefill_backend)
+                if flag_name and not getattr(builder_instance, flag_name, False):
+                    raise RuntimeError(
+                        f"Prefill backend '{prefill_backend}' was requested "
+                        f"but the metadata builder did not enable it. This "
+                        f"usually means a dependency is missing (e.g., "
+                        f"flashinfer not installed) or the platform doesn't "
+                        f"support it."
+                    )
+
        # Run each benchmark with the shared impl
        for config, threshold, num_splits in configs_with_params:
            # Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -819,6 +1040,7 @@ def _run_mla_benchmark_batched(
                    mla_dims,
                    device,
                    indexer=indexer,
+                    kv_cache_dtype=kv_cache_dtype,
                )
                results.append(result)

@@ -845,6 +1067,7 @@ def run_mla_benchmark(
    reorder_batch_threshold: int | None = None,
    num_kv_splits: int | None = None,
    index_topk: int = 2048,
+    prefill_backend: str | None = None,
 ) -> BenchmarkResult | list[BenchmarkResult]:
    """
    Unified MLA benchmark runner for all backends.
@@ -862,6 +1085,8 @@ def run_mla_benchmark(
                                 (single config mode only)
        num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
        index_topk: Topk value for sparse MLA backends (default 2048)
+        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
+            When set, forces the specified FlashAttention version for prefill.

    Returns:
        BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -885,7 +1110,9 @@ def run_mla_benchmark(
        return_single = True

    # Use unified batched execution
-    results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
+    results = _run_mla_benchmark_batched(
+        backend, configs_with_params, index_topk, prefill_backend=prefill_backend
+    )

    # Return single result or list based on input
    return results[0] if return_single else results
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -140,8 +140,7 @@ def _create_vllm_config(

    cache_config = CacheConfig(
        block_size=config.block_size,
-        cache_dtype="auto",
-        swap_space=0,
+        cache_dtype=config.kv_cache_dtype,
    )
    cache_config.num_gpu_blocks = max_num_blocks
    cache_config.num_cpu_blocks = 0
@@ -216,7 +215,7 @@ def _create_backend_impl(
        num_kv_heads=config.num_kv_heads,
        alibi_slopes=None,
        sliding_window=None,
-        kv_cache_dtype="auto",
+        kv_cache_dtype=config.kv_cache_dtype,
    )

    kv_cache_spec = FullAttentionSpec(
@@ -289,12 +288,22 @@ def _create_input_tensors(
    total_q: int,
    device: torch.device,
    dtype: torch.dtype,
+    quantize_query: bool = False,
 ) -> tuple:
-    """Create Q, K, V input tensors for all layers."""
+    """Create Q, K, V input tensors for all layers.
+
+    When quantize_query is True, queries are cast to fp8 to match backends
+    that require query/key/value dtype consistency.
+    """
+    q_dtype = dtype
+    if quantize_query:
+        from vllm.platforms import current_platform
+
+        q_dtype = current_platform.fp8_dtype()
    q_list = [
        torch.randn(
            total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
-        )
+        ).to(q_dtype)
        for _ in range(config.num_layers)
    ]
    k_list = [
@@ -345,10 +354,17 @@ def _create_kv_cache(
    # Compute inverse permutation to get back to logical view
    inv_order = [stride_order.index(i) for i in range(len(stride_order))]

+    # Use fp8 dtype for cache when requested.
+    cache_dtype = dtype
+    if config.kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform
+
+        cache_dtype = current_platform.fp8_dtype()
+
    cache_list = []
    for _ in range(config.num_layers):
        # Allocate in physical layout order (contiguous in memory)
-        cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
+        cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
        # Permute to logical view
        cache = cache.permute(*inv_order)
        cache_list.append(cache)
@@ -391,7 +407,38 @@ def _run_single_benchmark(
                attn_metadata,
                output=out,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
+
+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            for i in range(config.num_layers):
+                impl.forward(
+                    layer,
+                    q_list[i],
+                    k_list[i],
+                    v_list[i],
+                    cache_list[i],
+                    attn_metadata,
+                    output=out,
+                )
+        benchmark_fn = graph.replay
+    else:
+
+        def benchmark_fn():
+            for i in range(config.num_layers):
+                impl.forward(
+                    layer,
+                    q_list[i],
+                    k_list[i],
+                    v_list[i],
+                    cache_list[i],
+                    attn_metadata,
+                    output=out,
+                )

    # Benchmark
    times = []
@@ -400,27 +447,18 @@ def _run_single_benchmark(
        end = torch.cuda.Event(enable_timing=True)

        start.record()
-        for i in range(config.num_layers):
-            impl.forward(
-                layer,
-                q_list[i],
-                k_list[i],
-                v_list[i],
-                cache_list[i],
-                attn_metadata,
-                output=out,
-            )
+        benchmark_fn()
        end.record()

-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        elapsed_ms = start.elapsed_time(end)
        times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer

    mem_stats = {}
    if config.profile_memory:
        mem_stats = {
-            "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
-            "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+            "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
+            "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
        }

    return times, mem_stats
@@ -444,7 +482,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
        BenchmarkResult with timing and memory statistics
    """
    device = torch.device(config.device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)

    backend_cfg = _get_backend_config(config.backend)

@@ -503,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                common_attn_metadata=common_metadata,
            )

+            # Only quantize queries when the impl supports it
+            quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
+                impl, "supports_quant_query_input", False
+            )
            q_list, k_list, v_list = _create_input_tensors(
-                config, total_q, device, dtype
+                config, total_q, device, dtype, quantize_query=quantize_query
            )

            cache_list = _create_kv_cache(
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
 | `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
-| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
+| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
 | `TP` | **Required.** The tensor-parallelism size. | `1` |
 | `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
 | `INPUT_LEN` | **Required.** Request input length. | `4000` |
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -85,7 +85,6 @@ start_server() {
    # Each argument and its value are separate elements.
    local common_args_array=(
        "$MODEL"
-        "--disable-log-requests"
        "--port" "8004"
        "--host" "$HOSTNAME"
        "--gpu-memory-utilization" "$gpu_memory_utilization"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = {
    "sglang": async_request_openai_completions,
    "llama.cpp": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k
-    for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions, async_request_openai_chat_completions)
-]
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -94,15 +94,18 @@ def create_logits(

 def measure_memory() -> tuple[int, int]:
    """Return (allocated, reserved) memory in bytes."""
-    torch.cuda.synchronize()
-    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
+    torch.accelerator.synchronize()
+    return (
+        torch.accelerator.memory_allocated(),
+        torch.accelerator.max_memory_allocated(),
+    )


 def reset_memory_stats():
    """Reset peak memory statistics."""
    reset_buffer_cache()
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.empty_cache()
+    torch.accelerator.reset_peak_memory_stats()
+    torch.accelerator.empty_cache()
    gc.collect()


@@ -123,7 +126,7 @@ def benchmark_function(
    for _ in range(warmup_iters):
        logits_copy = logits.clone()
        func(logits_copy, k, p)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Reset memory stats before benchmark
    reset_memory_stats()
@@ -140,7 +143,7 @@ def benchmark_function(
        func(logits_copy, k, p)
        end_events[i].record()

-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Calculate timing
    times = [
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -1,78 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import argparse
-import json
-import math
-import os
 import time
 from types import TracebackType
-from typing import Any
-
-
-def convert_to_pytorch_benchmark_format(
-    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
-) -> list:
-    """
-    Save the benchmark results in the format used by PyTorch OSS benchmark with
-    on metric per record
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-    """
-    records = []
-    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
-        return records
-
-    for name, benchmark_values in metrics.items():
-        record = {
-            "benchmark": {
-                "name": "vLLM benchmark",
-                "extra_info": {
-                    "args": vars(args),
-                },
-            },
-            "model": {
-                "name": args.model,
-            },
-            "metric": {
-                "name": name,
-                "benchmark_values": benchmark_values,
-                "extra_info": extra_info,
-            },
-        }
-
-        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
-        # Save tensor_parallel_size parameter if it's part of the metadata
-        if not tp and "tensor_parallel_size" in extra_info:
-            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
-                extra_info["tensor_parallel_size"]
-            )
-
-        records.append(record)
-
-    return records
-
-
-class InfEncoder(json.JSONEncoder):
-    def clear_inf(self, o: Any):
-        if isinstance(o, dict):
-            return {k: self.clear_inf(v) for k, v in o.items()}
-        elif isinstance(o, list):
-            return [self.clear_inf(v) for v in o]
-        elif isinstance(o, float) and math.isinf(o):
-            return "inf"
-        return o
-
-    def iterencode(self, o: Any, *args, **kwargs) -> Any:
-        return super().iterencode(self.clear_inf(o), *args, **kwargs)
-
-
-def write_to_json(filename: str, records: list) -> None:
-    with open(filename, "w") as f:
-        json.dump(
-            records,
-            f,
-            cls=InfEncoder,
-            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
-        )


 # Collect time and generate time metrics
--- a/benchmarks/cutlass_benchmarks/utils.py
+++ b/benchmarks/cutlass_benchmarks/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Cutlass bench utils
-from collections.abc import Iterable

 import torch

@@ -86,15 +85,3 @@ def make_rand_sparse_tensors(

    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b
-
-
-def make_n_rand_sparse_tensors(
-    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
-) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
-    ABs = []
-    for _ in range(num_tensors):
-        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
-        if b_comp is not None:
-            ABs.append(make_rand_sparse_tensors(dtype, m, n, k))
-    BComps, Es, As, Bs = zip(*ABs)
-    return list(BComps), list(Es), list(As), list(Bs)
--- a/benchmarks/disagg_benchmarks/rate_limiter.py
+++ b/benchmarks/disagg_benchmarks/rate_limiter.py
@@ -1,45 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """Token bucket rate limiter implementation"""
-
-    def __init__(self, rate_limit):
-        self.rate_limit = rate_limit  # Requests per second
-        self.num_available_tokens = rate_limit  # Available tokens
-        self.last_refill = time.monotonic()  # Last token refill time
-        self.lock = asyncio.Lock()  # Synchronization lock
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter"""
-        while True:
-            async with self.lock:
-                current_time = time.monotonic()
-                elapsed = current_time - self.last_refill
-
-                # Refill num_available_tokens if more than 1 second has passed
-                if elapsed > 1.0:
-                    self.num_available_tokens = self.rate_limit
-                    self.last_refill = current_time
-
-                # Check if num_available_tokens are available
-                if self.num_available_tokens > 0:
-                    self.num_available_tokens -= 1
-                    return True
-
-                # Calculate wait time if no num_available_tokens available
-                wait_time = 1.0 - elapsed
-            await asyncio.sleep(wait_time)
-
-    async def __aenter__(self):
-        """Enter async context manager - acquire token"""
-        await self.acquire()
-        return self
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        """Exit async context manager - no cleanup needed"""
-        pass
--- a/benchmarks/disagg_benchmarks/request_queue.py
+++ b/benchmarks/disagg_benchmarks/request_queue.py
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-from collections import deque
-
-
-class RequestQueue:
-    """Request queue manager with concurrency control"""
-
-    def __init__(self, max_concurrent, max_queue_size):
-        # Maximum concurrent requests
-        self.max_concurrent = max_concurrent
-        self.max_queue_size = max_queue_size  # Maximum queue size
-        # Concurrency control
-        self.semaphore = asyncio.Semaphore(max_concurrent)
-        self.queue = deque()  # Request queue
-        self.queue_size = 0  # Current queue size
-        self.lock = asyncio.Lock()  # Sync queue Lock
-
-    async def enqueue(self, task):
-        """Add a request task to the queue"""
-        async with self.lock:
-            if self.queue_size >= self.max_queue_size:
-                return False
-
-            self.queue.append(task)
-            self.queue_size += 1
-            return True
-
-    async def process(self):
-        """Process queued requests using semaphore for concurrency control"""
-        while True:
-            if self.queue:
-                async with self.semaphore, self.lock:
-                    task = self.queue.popleft()
-                    self.queue_size -= 1
-                    await task
-            await asyncio.sleep(0.01)  # Yield control to event loop
--- a/benchmarks/kernels/bench_concat_mla_q.py
+++ b/benchmarks/kernels/bench_concat_mla_q.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.triton_utils import triton
+
+# DeepSeek V3 dimensions
+NOPE_DIM = 512
+ROPE_DIM = 64
+NUM_HEADS = 128
+
+NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
+
+
+def get_configs():
+    return NUM_TOKENS
+
+
+def make_inputs(num_tokens, dtype):
+    """Create inputs matching the real code path.
+
+    Args:
+        contiguous_nope: If False, simulate the transposed BMM output
+                         (non-contiguous nope with stride pattern from
+                         [N,B,L].transpose(0,1)).
+    """
+    # Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
+    raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
+    ql_nope = raw.transpose(0, 1)
+
+    q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
+    return ql_nope, q_pe
+
+
+# ---- Non-contiguous nope benchmark (real code path) ----
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=get_configs(),
+        line_arg="provider",
+        line_vals=["torch_cat", "concat_mla_q"],
+        line_names=["torch.cat", "concat_mla_q (v8)"],
+        styles=[("blue", "--"), ("green", "-")],
+        ylabel="Latency (us)",
+        plot_name="concat_mla_q-transposed",
+        args={},
+    )
+)
+def bench_transposed(num_tokens, provider):
+    dtype = torch.bfloat16
+    ql_nope, q_pe = make_inputs(num_tokens, dtype)
+
+    q_out = torch.empty(
+        num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch_cat":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
+        )
+    else:
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
+        )
+
+    return ms * 1000, max_ms * 1000, min_ms * 1000  # us
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
+    parser.add_argument(
+        "--save-path", type=str, default=None, help="Path to save benchmark results"
+    )
+    args = parser.parse_args()
+
+    print("\n" + "=" * 70)
+    print("CONCAT MLA Q KERNEL BENCHMARKS")
+    print("=" * 70)
+    print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
+    print(
+        f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
+        f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
+    )
+    print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
+    print("=" * 70)
+
+    print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
+    bench_transposed.run(print_data=True, save_path=args.save_path)
+
+    print("\n" + "=" * 70)
+    print("Benchmarking complete!")
+    print("=" * 70)
--- a/benchmarks/kernels/bench_cp_gather_fp8.py
+++ b/benchmarks/kernels/bench_cp_gather_fp8.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import math
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.triton_utils import triton
+
+# DeepSeek V3 MLA dimensions
+NOPE_DIM = 512
+ROPE_DIM = 64
+HEAD_DIM = NOPE_DIM + ROPE_DIM  # 576 BF16 output elements per token
+ENTRY_BYTES = 656  # 512 FP8 + 16 scales + 128 BF16 RoPE
+BLOCK_SIZE = 64  # tokens per physical cache block - get_supported_kernel_block_sizes
+
+# Realistic prefill scenarios:
+#   - 1 long prefill: single request, 16K-96K tokens
+#   - 4 medium prefills: 4 requests, 4K-24K tokens each
+#   - 16 shorter prefills: 16 requests, 1K-6K tokens each
+SCENARIOS = [
+    # (label, num_reqs, total_tokens_list)
+    ("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
+    ("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
+    ("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
+]
+
+
+def make_inputs(total_tokens, num_reqs, block_size):
+    """Create synthetic FP8 cache, block table, and output buffer.
+
+    Fills the cache with random bytes (we only measure throughput,
+    not correctness). Block table maps each request to contiguous
+    physical blocks.
+    """
+    # Divide tokens evenly across requests
+    base_len = total_tokens // num_reqs
+    remainder = total_tokens % num_reqs
+    seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
+
+    # workspace_starts: cumulative sum of seq_lens
+    workspace_starts = [0] * num_reqs
+    for r in range(1, num_reqs):
+        workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
+
+    # Physical blocks needed per request
+    blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
+    total_blocks = sum(blocks_per_req)
+    max_blocks = max(blocks_per_req)
+
+    # Allocate cache with random data (content doesn't matter for perf)
+    cache = torch.randint(
+        0,
+        256,
+        (total_blocks, block_size, ENTRY_BYTES),
+        dtype=torch.uint8,
+        device="cuda",
+    )
+
+    # Block table: contiguous block assignments
+    block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
+    block_idx = 0
+    for r in range(num_reqs):
+        for b in range(blocks_per_req[r]):
+            block_table[r, b] = block_idx
+            block_idx += 1
+
+    # Output workspace
+    dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
+
+    seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
+    workspace_starts_t = torch.tensor(
+        workspace_starts, dtype=torch.int32, device="cuda"
+    )
+
+    return cache, dst, block_table, seq_lens_t, workspace_starts_t
+
+
+def bench_scenario(label, num_reqs, total_tokens_list, save_path):
+    """Run benchmark for a specific (num_reqs, total_tokens) scenario."""
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["total_tokens"],
+            x_vals=total_tokens_list,
+            line_arg="provider",
+            line_vals=["cuda_kernel"],
+            line_names=["cp_gather_fp8 (CUDA)"],
+            styles=[("green", "-")],
+            ylabel="Latency (us)",
+            plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
+            args={"num_reqs": num_reqs},
+        )
+    )
+    def bench_fn(total_tokens, provider, num_reqs):
+        cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
+            total_tokens, num_reqs, BLOCK_SIZE
+        )
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
+                cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
+            ),
+            quantiles=quantiles,
+            rep=500,
+        )
+
+        return ms * 1000, max_ms * 1000, min_ms * 1000  # us
+
+    seq_len_per_req = total_tokens_list[0] // num_reqs
+    seq_len_per_req_max = total_tokens_list[-1] // num_reqs
+    print(
+        f"\n--- {label}: {num_reqs} request(s), "
+        f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
+    )
+    bench_fn.run(print_data=True, save_path=save_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default=None,
+        help="Path to save benchmark results as CSV",
+    )
+    args = parser.parse_args()
+
+    # Print data volume info for bandwidth analysis
+    read_per_token = ENTRY_BYTES  # 656 bytes from cache
+    write_per_token = HEAD_DIM * 2  # 576 * 2 = 1152 bytes to workspace
+    total_per_token = read_per_token + write_per_token  # 1808 bytes
+
+    print("\n" + "=" * 70)
+    print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
+    print("=" * 70)
+    print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
+    print(f"Output row:  {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
+    print(f"Per token:   {total_per_token} bytes (read + write)")
+    print(f"Block size:  {BLOCK_SIZE} tokens/block")
+    print("=" * 70)
+
+    for label, num_reqs, total_tokens_list in SCENARIOS:
+        bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
+
+    print("\n" + "=" * 70)
+    print("Benchmarking complete!")
+    print("=" * 70)
--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -168,7 +168,7 @@ def bench_impl(
    # warmup
    for kwargs in kwargs_list:
        impl_type.get_impl()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)

-    # test ouptut
+    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -12,12 +12,12 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager
@@ -64,7 +64,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    (m, k, n) = mkn

    dtype = torch.half
@@ -137,15 +137,21 @@ def bench_run(
        per_out_ch_quant=per_out_ch,
    )

-    fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+    moe_config = make_dummy_moe_config(
+        num_experts=num_experts,
+        hidden_dim=k,
+        intermediate_size_per_partition=n,
+        in_dtype=a.dtype,
+    )
+    fn = mk.FusedMoEKernel(
+        maybe_make_prepare_finalize(
+            moe=moe_config,
+            quant_config=quant_config,
+            allow_new_interface=True,
+            use_monolithic=False,
+        ),
        CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=num_experts,
-                hidden_dim=k,
-                intermediate_size_per_partition=n,
-                in_dtype=a.dtype,
-            ),
+            moe_config=moe_config,
            quant_config=quant_config,
        ),
    )
@@ -165,7 +171,7 @@ def bench_run(
                activation=MoEActivation.SILU,
                global_num_experts=num_experts,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
@@ -181,14 +187,14 @@ def bench_run(
                topk_ids,
                quant_config=quant_config,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        # Timing
        start_event = torch.Event(enable_timing=True)
@@ -196,7 +202,7 @@ def bench_run(

        latencies = []
        for _ in range(num_iters):
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -15,6 +15,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
    fp8_w8a8_moe_quant_config,
    nvfp4_moe_quant_config,
@@ -23,9 +26,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import (
    CutlassExpertsFp4,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.scalar_type import scalar_types
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager
@@ -196,10 +196,21 @@ def bench_run(
            g2_alphas=w2_gs,
        )

-        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+        moe_config = make_dummy_moe_config(
+            num_experts=num_experts,
+            hidden_dim=k,
+            intermediate_size_per_partition=n,
+            in_dtype=a.dtype,
+        )
+        kernel = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
            CutlassExpertsFp4(
-                make_dummy_moe_config(),
+                moe_config=moe_config,
                quant_config=quant_config,
            ),
        )
@@ -240,11 +251,17 @@ def bench_run(
            g1_alphas=w1_gs,
            g2_alphas=w2_gs,
        )
+        moe_config = make_dummy_moe_config()

-        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+        kernel = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
            CutlassExpertsFp4(
-                make_dummy_moe_config(),
+                moe_config=moe_config,
                quant_config=quant_config,
            ),
        )
@@ -290,7 +307,7 @@ def bench_run(
    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
@@ -313,7 +330,7 @@ def bench_run(
            e=num_experts,
            device=device,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
@@ -328,7 +345,7 @@ def bench_run(
            w2_fp8scale,
            a_fp8_scale,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    min_run_time = 5
    num_warmup = 5
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
            if not should_use_fn(tensor):
                return None

-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            stream = torch.cuda.Stream()
            with torch.cuda.stream(stream):
                graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)

-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            for _ in range(num_warmup):
                graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()

-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            start_time = time.perf_counter()

            for _ in range(num_trials):
                graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()

            end_time = time.perf_counter()

@@ -495,7 +495,7 @@ def main():

    # Set device
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)

    # Get CPU process group
    cpu_group = dist.new_group(backend="gloo")
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -385,32 +385,32 @@ def benchmark_operation(
    # Warmup before graph capture
    for _ in range(warmup):
        operation_func(*args, **kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Create CUDA graph
    graph = torch.cuda.CUDAGraph()
    num_op_per_cudagraph = 10

    # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
-    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
    with graph_capture(device=device), torch.cuda.graph(graph):
        for _ in range(num_op_per_cudagraph):
            operation_func(*args, **kwargs)

    # Graph warmup
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    for _ in range(warmup):
        graph.replay()

    # Benchmark with CUDA graph
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_time = time.perf_counter()

    for _ in range(trials // num_op_per_cudagraph):
        # operation_func(*args, **kwargs)
        graph.replay()

-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    end_time = time.perf_counter()

    avg_time_ms = ((end_time - start_time) / trials) * 1000
@@ -984,7 +984,7 @@ def main():
    world_size = int(os.environ["WORLD_SIZE"])

    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)

    init_distributed_environment()
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -9,15 +9,15 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    fused_topk,
 )
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.worker.workspace import init_workspace_manager

@@ -50,7 +50,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    label = "Quant Matmul"

    sub_label = (
@@ -131,16 +131,22 @@ def bench_run(
            w2_scale=w2_scale,
            per_act_token_quant=per_act_token,
        )
+        moe_config = make_dummy_moe_config(
+            num_experts=w2.shape[0],
+            hidden_dim=w2.shape[1],
+            intermediate_size_per_partition=w2.shape[2],
+            in_dtype=a.dtype,
+        )

-        fn = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+        fn = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
            CutlassExpertsFp8(
-                moe_config=make_dummy_moe_config(
-                    num_experts=w2.shape[0],
-                    hidden_dim=w2.shape[1],
-                    intermediate_size_per_partition=w2.shape[2],
-                    in_dtype=a.dtype,
-                ),
+                moe_config=moe_config,
                quant_config=quant_config,
            ),
        )
@@ -163,16 +169,22 @@ def bench_run(
            w2_scale=w2_scale,
            per_act_token_quant=per_act_token,
        )
+        moe_config = make_dummy_moe_config(
+            num_experts=w2.shape[0],
+            hidden_dim=w2.shape[1],
+            intermediate_size_per_partition=w2.shape[2],
+            in_dtype=a.dtype,
+        )

-        fn = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(),
+        fn = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
            CutlassExpertsFp8(
-                moe_config=make_dummy_moe_config(
-                    num_experts=w2.shape[0],
-                    hidden_dim=w2.shape[1],
-                    intermediate_size_per_partition=w2.shape[2],
-                    in_dtype=a.dtype,
-                ),
+                moe_config=moe_config,
                quant_config=quant_config,
            ),
        )
@@ -212,7 +224,7 @@ def bench_run(
    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
@@ -227,7 +239,7 @@ def bench_run(
            topk_weights,
            topk_ids,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
@@ -242,7 +254,7 @@ def bench_run(
            w2_scale,
            a_scale,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    min_run_time = 5
    num_warmup = 5
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -34,14 +34,14 @@ def main(
    residual = torch.randn_like(x) * scale if add_residual else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        for _ in range(num_iters):
            layer(x, residual)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1035,7 +1035,7 @@ def bench_optype(
    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -47,13 +47,13 @@ def benchmark_method(
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    end = time.perf_counter()

    return (end - start) / num_iters * 1000  # Convert to ms
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -17,6 +17,9 @@ from ray.experimental.tqdm_ray import tqdm

 from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -51,7 +54,7 @@ def clear_triton_cache():

    # Clear CUDA memory cache
    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()

    # Try to clear Triton's runtime cache
    try:
@@ -242,24 +245,33 @@ def benchmark_config(

        deep_gemm_experts = None
        if use_deep_gemm:
-            deep_gemm_experts = mk.FusedMoEModularKernel(
-                prepare_finalize=MoEPrepareAndFinalizeNoEP(),
+            moe_config = (
+                FusedMoEConfig(
+                    num_experts=num_experts,
+                    experts_per_token=topk,
+                    hidden_dim=hidden_size,
+                    intermediate_size_per_partition=shard_intermediate_size,
+                    num_local_experts=num_experts,
+                    num_logical_experts=num_experts,
+                    activation=MoEActivation.SILU,
+                    moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+                    in_dtype=init_dtype,
+                    routing_method=RoutingMethodType.TopK,
+                    device="cuda",
+                ),
+            )
+            deep_gemm_experts = mk.FusedMoEKernel(
+                prepare_finalize=maybe_make_prepare_finalize(
+                    moe=moe_config,
+                    quant_config=quant_config,
+                    allow_new_interface=True,
+                    use_monolithic=False,
+                ),
                fused_experts=TritonOrDeepGemmExperts(
-                    moe_config=FusedMoEConfig(
-                        num_experts=num_experts,
-                        experts_per_token=topk,
-                        hidden_dim=hidden_size,
-                        intermediate_size_per_partition=shard_intermediate_size,
-                        num_local_experts=num_experts,
-                        num_logical_experts=num_experts,
-                        activation=MoEActivation.SILU,
-                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-                        in_dtype=init_dtype,
-                        routing_method=RoutingMethodType.TopK,
-                        device="cuda",
-                    ),
+                    moe_config=moe_config,
                    quant_config=quant_config,
                ),
+                inplace=not disable_inplace(),
            )

        with override_config(config):
@@ -269,8 +281,16 @@ def benchmark_config(

            inplace = not disable_inplace()
            if use_deep_gemm:
-                return deep_gemm_experts(
-                    x, w1, w2, topk_weights, topk_ids, inplace=inplace
+                return deep_gemm_experts.apply(
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    activation=MoEActivation.SILU,
+                    global_num_experts=num_experts,
+                    apply_router_weight_on_input=False,
+                    expert_map=False,
                )
            return fused_experts(
                x,
@@ -284,19 +304,19 @@ def benchmark_config(

    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -304,7 +324,7 @@ def benchmark_config(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        start_event.record()
        graph.replay()
@@ -606,7 +626,11 @@ class BenchmarkWorker:
            if visible_device != f"{self.device_id}":
                need_device_guard = True

-        with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+        with (
+            torch.accelerator.device_index(self.device_id)
+            if need_device_guard
+            else nullcontext()
+        ):
            for idx, config in enumerate(tqdm(search_space)):
                try:
                    kernel_time = benchmark_config(
--- a/benchmarks/kernels/benchmark_moe_defaults.py
+++ b/benchmarks/kernels/benchmark_moe_defaults.py
@@ -131,7 +131,7 @@ def benchmark_config(
                topk_ids,
                quant_config=quant_config,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Benchmark
    start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
                quant_config=quant_config,
            )
    end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    return start.elapsed_time(end) / num_iters * 1000  # ms -> us


--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -69,19 +69,19 @@ def benchmark_permute(

    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        start_event.record()
        graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
    # JIT compilation & warmup
    input = prepare()
    run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
        end_event.record()
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -135,14 +135,14 @@ def benchmark_mrope(
            key.clone(),
        )

-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    # Time reference implementation
    torch_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()

        mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
            key_clone,
        )

-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        torch_times.append(time.time() - start_time)

    # Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()
        mrope_helper_class.forward_cuda(
            positions,
            query_clone,
            key_clone,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        triton_times.append(time.time() - start_time)

    # Calculate statistics
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -103,7 +103,7 @@ def main(
        max_logits = torch.empty_like(exp_sums)

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
                    )
            else:
                raise ValueError(f"Invalid version: {version}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -28,7 +28,7 @@ def _time_cuda(
    # warmup
    for _ in range(warmup_iters):
        fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
    for _ in range(bench_iters):
        fn()
    end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    return start.elapsed_time(end) / bench_iters  # ms/iter

--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -29,7 +29,7 @@ def main(
    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
                ops.scaled_int8_quant(x, scale)
            else:
                ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -84,16 +84,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

@@ -104,7 +104,7 @@ def run_benchmark(

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()

    return lat

--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -109,16 +109,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

@@ -129,7 +129,7 @@ def run_benchmark(

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()

    return lat

--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -251,7 +251,7 @@ def benchmark(
        kernel(
            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
    # Benchmark
    latencies: list[float] = []
    for _ in range(runs):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()

        start_event.record()
        for i in range(iterations_per_run):
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -126,7 +126,7 @@ def benchmark_decode(
    )

    def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = torch.Event(enable_timing=True)
        end = torch.Event(enable_timing=True)
        times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
            start.record()
            fn()
            end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -138,7 +138,7 @@ def benchmark_prefill(
    )

    def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = torch.Event(enable_timing=True)
        end = torch.Event(enable_timing=True)
        times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
            start.record()
            fn()
            end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -177,18 +177,18 @@ def benchmark_config(
    def run():
        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)

-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # JIT complication & warmup
    for _ in range(5):
        run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()

    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)

    latencies: list[float] = []
    for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        run()
        end_event.record()
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]

-    torch.cuda.set_device(gpu_id)
+    torch.accelerator.set_device_index(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")

    block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):

 def main(args):
    print(args)
-    num_gpus = torch.cuda.device_count()
+    num_gpus = torch.accelerator.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")
--- a/Show More
+++ b/Show More