[ROCm] Fix AttributeError for torch.compiler.skip_all_guards_unsafe on older PyTorch (#37219)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
[Refactor] Relocate completion and chat completion tests (#37125)
2026-03-17 11:34:49 +08:00 · 2026-03-17 11:31:23 +08:00 · 2026-03-17 03:24:34 +00:00 · 2026-03-16 18:04:15 -07:00 · 2026-03-16 17:48:42 -07:00 · 2026-03-17 00:38:52 +00:00
1164 changed files with 85235 additions and 45354 deletions
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -10,7 +10,7 @@ steps:
      docker build
      --build-arg max_jobs=16
      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
      -f docker/Dockerfile.rocm
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -21,6 +21,20 @@ steps:
      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
      pytest -x -v -s tests/kernels/test_onednn.py"
 - label: CPU-Compatibility Tests
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  - cmake/cpu_extension.cmake
  - setup.py
  - vllm/platforms/cpu.py
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
 - label: CPU-Language Generation and Pooling Model Tests
  depends_on: []
  soft_fail: true
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -25,9 +25,7 @@ fi
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_X86=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
  --target vllm-test \
  --progress plain .
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,9 +13,10 @@ import os
 from contextlib import contextmanager
 import lm_eval
 import numpy as np
 import yaml
 from vllm.platforms import current_platform
 DEFAULT_RTOL = 0.08
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
        "allow_deprecated_quantization=True,"
    )
    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
        model_args += "attention_backend=TRITON_ATTN"
    env_vars = eval_config.get("env_vars", None)
    with scoped_env_vars(env_vars):
        results = lm_eval.simple_evaluate(
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
                f"ground_truth={ground_truth:.3f} | "
                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+
            min_acceptable = ground_truth * (1 - rtol)
            success = success and measured_value >= min_acceptable
    assert success
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -7,12 +7,12 @@ import argparse
 import html as _html
 import json
 import os
 from contextlib import nullcontext
 from dataclasses import dataclass
 from importlib import util
 from pathlib import Path
 import pandas as pd
 import regex as re
 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
 pd.set_option("display.float_format", lambda x: f"{x:.2f}")
 # -----------------------------
 # Concurrency normalization (NEW, small)
 # -----------------------------
 def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c
    for c in df.columns:
        if "concurr" in str(c).lower():
            s = df[c]
            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
                return c
    raise ValueError(
        "Cannot infer concurrency column. "
        "Please rename the column to one of the known names "
        "or add an explicit override (e.g., --concurrency-col)."
    )
 def _normalize_concurrency_in_df(
    df: pd.DataFrame, canonical: str = "# of max concurrency."
 ) -> pd.DataFrame:
    if canonical in df.columns:
        return df
    detected = _find_concurrency_col(df)
    if detected in df.columns and detected != canonical:
        return df.rename(columns={detected: canonical})
    df[canonical] = pd.NA
    return df
 # -----------------------------
 # Core data compare
 # -----------------------------
@@ -52,19 +91,25 @@ def compare_data_columns(
    - Concat along axis=1 (indexes align), then reset_index so callers can
      group by columns.
    - If --debug, add a <file_label>_name column per file.
    Minimal fix to support different max_concurrency lists across files:
      - normalize concurrency column naming to "# of max concurrency."
      - align on UNION of keys (missing points become NaN)
      - BUGFIX: don't drop throughput rows based on P99/Median presence
    """
    print("\ncompare_data_column:", data_column)
    frames = []
    raw_data_cols: list[str] = []
    compare_frames = []
    # Determine key cols after normalizing concurrency
    cols_per_file: list[set] = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
        df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
        cols_per_file.append(set(df_tmp.columns))
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -75,12 +120,25 @@ def compare_data_columns(
            "No common key columns found from info_cols across the input files."
        )
-    meta_added = False
+    union_index = None
    metas: list[pd.DataFrame] = []
    staged: list[tuple[str, pd.Series, pd.Series | None]] = []
    for file in files:
        df = pd.read_json(file, orient="records")
        df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
-        if drop_column in df.columns:
+        # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
        # NaN in P99/Median columns even if the column exists in the JSON.
        metric_lc = str(data_column).lower()
        is_latency_metric = (
            "ttft" in metric_lc
            or "tpot" in metric_lc
            or "p99" in metric_lc
            or "median" in metric_lc
            or metric_lc.strip() in {"p99", "median"}
        )
        if is_latency_metric and drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)
        for c in (
@@ -105,35 +163,61 @@ def compare_data_columns(
            meta = meta.groupby(level=key_cols, dropna=False).first()
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
+
-        if not s.index.is_unique:
+        if data_column in df_idx.columns:
-            s = s.groupby(level=key_cols, dropna=False).mean()
+            s = df_idx[data_column]
            if not s.index.is_unique:
                s = s.groupby(level=key_cols, dropna=False).mean()
        else:
            # keep NA series to preserve meta keys for union_index
            s = pd.Series(pd.NA, index=meta.index)
        s.name = file_label
-        if not meta_added:
+        name_s = None
            frames.append(meta)
            meta_added = True
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)
-        frames.append(s)
+        if union_index is None:
            union_index = meta.index
        else:
            union_index = union_index.union(meta.index)
        metas.append(meta)
        staged.append((file_label, s, name_s))
    if union_index is None:
        raise ValueError("No data found after loading inputs.")
    # meta first (union-aligned): build UNION meta across all files
    if metas:
        meta_union = pd.concat(metas, axis=0)
        # Collapse duplicates on the MultiIndex; keep first non-null per column
        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
        frames.append(meta_union.reindex(union_index))
    # values + ratios (union-aligned)
    metric_series_aligned: list[pd.Series] = []
    for file_label, s, name_s in staged:
        s_aligned = s.reindex(union_index)
        frames.append(s_aligned)
        raw_data_cols.append(file_label)
-        compare_frames.append(s)
+        metric_series_aligned.append(s_aligned)
-        if len(compare_frames) >= 2:
+        if debug and name_s is not None:
-            base = compare_frames[0]
+            frames.append(name_s.reindex(union_index))
-            current = compare_frames[-1]
+
-            if "P99" in data_column or "Median" in data_column:
+        if len(metric_series_aligned) >= 2:
            base = metric_series_aligned[0]
            current = metric_series_aligned[-1]
            if "P99" in str(data_column) or "Median" in str(data_column):
                ratio = base / current
            else:
                ratio = current / base
            ratio = ratio.mask(base == 0)
-            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+            ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
            frames.append(ratio)
    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
 # -----------------------------
 # Styling helpers
 # -----------------------------
 def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c
    for c in df.columns:
        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
            return c
    return "# of max concurrency."
 def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
+    df: pd.DataFrame,
    threshold: float,
    slack_pct: float = 0.0,
 ) -> pd.io.formats.style.Styler:
    conc_col = _find_concurrency_col(df)
    key_cols = [
@@ -234,12 +304,24 @@ def _highlight_threshold(
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
-    return df.style.map(
+    try:
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        slack_pct = float(slack_pct or 0.0)
-        if pd.notna(v) and v <= threshold
+    except Exception:
-        else "",
+        slack_pct = 0.0
-        subset=conf_cols,
+    slack_limit = threshold * (1.0 + slack_pct / 100.0)
-    )
+
    def _cell(v):
        if pd.isna(v):
            return ""
        if v <= threshold:
            # Strict SLA
            return "background-color:#e6ffe6;font-weight:bold;"
        if v <= slack_limit:
            # Within slack range
            return "background-color:#ffe5cc;font-weight:bold;"
        return ""
    return df.style.map(_cell, subset=conf_cols)
 def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
      - max 31 chars
      - cannot contain: : \ / ? * [ ]
      - cannot be empty
    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
    module's compile overhead/edge-cases on some systems.
    """
    name = "sheet" if name is None else str(name)
-    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+
    # Replace illegal characters with underscore.
    trans = str.maketrans(
        {
            ":": "_",
            "\\": "_",
            "/": "_",
            "?": "_",
            "*": "_",
            "[": "_",
            "]": "_",
        }
    )
    name = name.translate(trans)
    # Strip quotes/spaces and collapse whitespace.
    name = name.strip().strip("'")
-    name = re.sub(r"\s+", " ", name)
+    name = " ".join(name.split())
    if not name:
        name = "sheet"
    return name[:31]
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:
 def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    d = dict(zip(group_cols, gkey_tuple))
-    model = d.get("Model", "model")
+
-    model_short = str(model).split("/")[-1]
+    # Always keep input/output lengths (these are important).
    ilen = d.get("Input Len", "")
    olen = d.get("Output Len", "")
    lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
    # Shorten model name aggressively to make room for lens.
    model = d.get("Model", "model")
    leaf = str(model).split("/")[-1]
    max_model_len = max(1, 31 - len(lens))
    model_short = leaf[:max_model_len]
    return _sanitize_sheet_name(f"{model_short}{lens}")
 def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
 ):
-    startrow = 0
+    """Write all blocks to a sheet with a single to_excel() call.
    Pandas+openpyxl can be extremely slow when called many times per sheet.
    We flatten blocks into one table with a 'Section' column to keep structure
    while making Excel generation fast and deterministic.
    """
    if not blocks:
        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
        return
    combined_parts: list[pd.DataFrame] = []
    for title, df in blocks:
-        pd.DataFrame([[title]]).to_excel(
+        df2 = df.copy()
-            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
+        # Put the section label as the first column for readability.
-        )
+        df2.insert(0, "Section", title)
-        startrow += 1
+        combined_parts.append(df2)
-        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
+
-        startrow += len(df) + 3
+    combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
    combined.to_excel(writer, sheet_name=sheet, index=False)
 def _safe_filename(s: str) -> str:
-    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
+    # Fast path without the third-party `regex` module.
-    return s[:180] if len(s) > 180 else s
+    s = " ".join(str(s).strip().split())
    allowed = []
    for ch in s:
        if ch.isalnum() or ch in "._-":
            allowed.append(ch)
        else:
            allowed.append("_")
    out = "".join(allowed)
    return out[:180] if len(out) > 180 else out
 # -----------------------------
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
 def _max_concurrency_ok(
-    df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
+    df: pd.DataFrame,
    conc_col: str,
    cfg_col: str,
    threshold: float,
    slack_pct: float = 0.0,
 ):
    if df is None or conc_col not in df.columns or cfg_col not in df.columns:
        return pd.NA
@@ -441,7 +573,14 @@ def _max_concurrency_ok(
    if d.empty:
        return pd.NA
-    ok = d[d[cfg_col] <= threshold]
+    # Accept values up to (1 + slack_pct%) above the SLA.
    try:
        slack_pct = float(slack_pct or 0.0)
    except Exception:
        slack_pct = 0.0
    effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
    ok = d[d[cfg_col] <= effective_limit]
    if ok.empty:
        return pd.NA
@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
    # Display SLA ranges in the table header (SLA .. SLA*(1+slack))
    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
            )
            if tpot_group_df is not None
            else pd.NA
        )
@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
        rows.append(
            {
                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
    ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
    tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
    ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
    tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
-            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
+            _max_concurrency_ok(
                ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
            )
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
-            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
+            _max_concurrency_ok(
                tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
            )
            if tpot_group_df is not None
            else pd.NA
        )
@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
        rows.append(
            {
                "Configuration": cfg,
-                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
+                f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
-                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
+                f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
        help="Reference limit for TPOT plots (ms)",
    )
-    # ---- NEW: export options ----
+    # ---- SLA tolerance (slack) options ----
    parser.add_argument(
        "--ttft-slack-pct",
        type=float,
        default=5.0,
        help="Allowed percentage above TTFT SLA (default: 5).",
    )
    parser.add_argument(
        "--tpot-slack-pct",
        type=float,
        default=5.0,
        help="Allowed percentage above TPOT SLA (default: 5).",
    )
    # ---- export options ----
    parser.add_argument(
        "--excel-out",
        type=str,
@@ -843,9 +1015,13 @@ def render_metric_table_html(
    metric_name = metric_label.lower()
    if "ttft" in metric_name:
-        styler = _highlight_threshold(display_group, args.ttft_max_ms)
+        styler = _highlight_threshold(
            display_group, args.ttft_max_ms, args.ttft_slack_pct
        )
    elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
-        styler = _highlight_threshold(display_group, args.tpot_max_ms)
+        styler = _highlight_threshold(
            display_group, args.tpot_max_ms, args.tpot_slack_pct
        )
    else:
        styler = display_group.style
@@ -962,22 +1138,46 @@ def write_report_group_first(
        csv_dir.mkdir(parents=True, exist_ok=True)
    excel_path = args.excel_out or "perf_comparison.xlsx"
-    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
+    disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
    # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
    excel_engine = (
        os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
    )
    if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
        excel_engine = "openpyxl"
    excel_engine_kwargs = {}
    if excel_engine == "xlsxwriter":
        # Reduce memory pressure & usually faster writes.
        excel_engine_kwargs = {"options": {"constant_memory": True}}
    xw_ctx = (
        nullcontext(None)
        if disable_excel
        else pd.ExcelWriter(
            excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
        )
    )
    with xw_ctx as xw:
        used_sheets: set[str] = set()
        # ---- Environment sheet (first) ----
        env_sheet = _sanitize_sheet_name("Environment")
        env_df = _load_env_df_for_inputs(args, files)
-        if env_df is None or env_df.empty:
+        if xw is not None:
-            pd.DataFrame(
+            if env_df is None or env_df.empty:
-                [
+                pd.DataFrame(
-                    {
+                    [
-                        "Section": "Environment",
+                        {
-                        "Key": "vllm_env.txt",
+                            "Section": "Environment",
-                        "Value": "NOT FOUND (or empty)",
+                            "Key": "vllm_env.txt",
-                    }
+                            "Value": "NOT FOUND (or empty)",
-                ]
+                        }
-            ).to_excel(xw, sheet_name=env_sheet, index=False)
+                    ]
-        else:
+                ).to_excel(xw, sheet_name=env_sheet, index=False)
-            env_df.to_excel(xw, sheet_name=env_sheet, index=False)
+            else:
                env_df.to_excel(xw, sheet_name=env_sheet, index=False)
            used_sheets.add(env_sheet)
        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
            main_fh.write('<meta charset="utf-8">\n')
            for gkey in group_keys:
@@ -993,12 +1193,19 @@ def write_report_group_first(
                main_fh.write(group_header)
                do_excel = xw is not None
                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
                sheet_base = sheet
-                dedup_i = 1
+                if do_excel:
-                while sheet in xw.sheets:
+                    dedup_i = 1
-                    dedup_i += 1
+                    while sheet in used_sheets:
-                    sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
+                        dedup_i += 1
                        suffix = f"_{dedup_i}"
                        # Ensure uniqueness even when sheet names are truncated.
                        base = str(sheet_base)
                        keep = max(1, 31 - len(suffix))
                        sheet = _sanitize_sheet_name(base[:keep] + suffix)
                    used_sheets.add(sheet)
                excel_blocks: list[tuple[str, pd.DataFrame]] = []
@@ -1059,7 +1266,7 @@ def write_report_group_first(
                        )
                        excel_blocks.append(
-                            (metric_label, display_group.reset_index(drop=True))
+                            (metric_label, group_df.reset_index(drop=True))
                        )
                        if csv_dir:
                            fn = _safe_filename(
@@ -1067,7 +1274,7 @@ def write_report_group_first(
                                    "/", "_"
                                )
                            )
-                            display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
+                            group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
                    summary_html = build_valid_max_concurrency_summary_html(
                        tput_group_df=tput_group_df,
@@ -1097,9 +1304,13 @@ def write_report_group_first(
                            )
                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
-                _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
+                if do_excel:
                    _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
-    print(f"Wrote Excel: {excel_path}")
+    if disable_excel:
        print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
    else:
        print(f"Wrote Excel: {excel_path}")
    if csv_dir:
        print(f"Wrote CSVs under: {csv_dir}")
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"
 # Adaptive search controls
 # All of these can be overridden from the environment; the values after ":-"
 # are the defaults used when the variable is unset or empty.
 #
 # "1" enables adaptive_refine_from_static_results; any other value makes it
 # return immediately.
 ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
 # SLA thresholds in milliseconds. evaluate_sla_from_json passes them to jq
 # with --argjson, so they must be valid JSON numbers.
 SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
 SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
 # Upper bound on the number of extra probes one adaptive refinement may run.
 ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
 # Highest concurrency the upward search is allowed to probe.
 ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
 # -------------------------------
 # Adaptive concurrency helpers
 # -------------------------------
 result_json_path_for_serving() {
  # Print the result JSON path for one serving data point.
  #   $1: test name   $2: request rate (qps)   $3: max concurrency
  # The path lives under $RESULTS_FOLDER.
  local name=$1 rate=$2 concurrency=$3
  printf '%s\n' "$RESULTS_FOLDER/${name}_qps_${rate}_concurrency_${concurrency}.json"
 }
 extract_metric_ms() {
  # Print a TTFT or TPOT latency value (ms) found in a benchmark result JSON.
  #
  #   $1: "ttft" selects the TTFT candidates; any other value selects the
  #       TPOT/ITL candidates.
  #   $2: path to the result JSON. A missing file prints nothing, returns 0.
  #
  # Candidate fields are tried in the order listed (p99 variants before mean
  # variants); the first non-null one is printed, or nothing if all are absent.
  local wanted=$1
  local result_file=$2
  local candidates

  if [[ ! -f "$result_file" ]]; then
    return 0
  fi

  case "$wanted" in
    ttft)
      candidates='
      [
        .ttft_ms.p99?,
        .metrics.ttft_ms.p99?,
        .ttft.p99?,
        .metrics.ttft.p99?,
        .p99_ttft_ms?,
        .ttft_ms.mean?,
        .metrics.ttft_ms.mean?,
        .ttft.mean?,
        .metrics.ttft.mean?,
        .mean_ttft_ms?
      ] | map(select(. != null)) | .[0] // empty
    '
      ;;
    *)
      candidates='
      [
        .tpot_ms.p99?,
        .metrics.tpot_ms.p99?,
        .tpot.p99?,
        .metrics.tpot.p99?,
        .p99_tpot_ms?,
        .itl_ms.p99?,
        .metrics.itl_ms.p99?,
        .inter_token_latency_ms.p99?,
        .tpot_ms.mean?,
        .metrics.tpot_ms.mean?,
        .tpot.mean?,
        .metrics.tpot.mean?,
        .itl_ms.mean?,
        .metrics.itl_ms.mean?,
        .mean_tpot_ms?,
        .mean_itl_ms?
      ] | map(select(. != null)) | .[0] // empty
    '
      ;;
  esac

  jq -r "$candidates" "$result_file"
 }
 evaluate_sla_from_json() {
  # Check one benchmark result file against the TTFT/TPOT SLA thresholds.
  #
  # Args:
  #   $1: path to a result JSON file.
  # Globals read:
  #   SLA_TTFT_MS, SLA_TPOT_MS
  # Returns:
  #   0 if both metrics are <= their SLA,
  #   1 if at least one metric exceeds its SLA,
  #   2 if no verdict is possible: the file is missing, a metric could not be
  #     extracted, or jq could not evaluate the comparison.
  local json_file=$1
  local ttft
  local tpot
  local pass
  [[ -f "$json_file" ]] || return 2
  ttft=$(extract_metric_ms ttft "$json_file")
  tpot=$(extract_metric_ms tpot "$json_file")
  [[ -n "$ttft" && -n "$tpot" ]] || return 2
  # --argjson needs valid JSON on every argument. If a metric or an SLA value
  # is not a number jq exits non-zero; report that as "no verdict" (2) rather
  # than letting it look like an SLA violation (1) to the search loops.
  pass=$(jq -n \
    --argjson ttft "$ttft" \
    --argjson tpot "$tpot" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)') || return 2
  [[ "$pass" == "true" ]]
 }
 write_adaptive_summary_json() {
  # Write the adaptive-search summary for one (test, qps) pair as JSON.
  #
  # Args:
  #   $1: output file path (overwritten)
  #   $2: test name
  #   $3: qps (stored as a JSON string, not converted to a number)
  #   $4: last passing concurrency among the static points, or ""
  #   $5: first failing concurrency among the static points, or ""
  #   $6: last passing concurrency after refinement, or ""
  #   $7: first failing concurrency after refinement, or ""
  # Globals read:
  #   SLA_TTFT_MS, SLA_TPOT_MS (passed with --argjson, so must be JSON numbers)
  local summary_file=$1
  local test_name=$2
  local qps=$3
  local static_last_pass=$4
  local static_first_fail=$5
  local final_last_pass=$6
  local final_first_fail=$7
  # The four boundary values are passed as strings so that "" can be mapped
  # to null; non-empty values are converted with tonumber.
  jq -n \
    --arg test_name "$test_name" \
    --arg qps "$qps" \
    --argjson sla_ttft "$SLA_TTFT_MS" \
    --argjson sla_tpot "$SLA_TPOT_MS" \
    --arg static_last_pass "${static_last_pass:-}" \
    --arg static_first_fail "${static_first_fail:-}" \
    --arg final_last_pass "${final_last_pass:-}" \
    --arg final_first_fail "${final_first_fail:-}" \
    '{
      test_name: $test_name,
      qps: $qps,
      sla_ttft_ms: $sla_ttft,
      sla_tpot_ms: $sla_tpot,
      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
    }' > "$summary_file"
 }
 run_single_serving_probe() {
  # Run one `vllm bench serve` probe at a given concurrency and report whether
  # its result meets the SLA.
  #
  # Args:
  #   $1: test name            $2: qps (request rate)   $3: max concurrency
  #   $4: tensor parallel size $5: compilation_config mode
  #   $6: optimization level   $7: client args          $8: remote client args
  #   $9: server command (only recorded in the .commands file)
  # Globals read:
  #   RESULTS_FOLDER, PROMPTS_PER_CONCURRENCY, MIN_NUM_PROMPTS, MAX_NUM_PROMPTS,
  #   DRY_RUN, gpu_type
  # Returns:
  #   the exit status of evaluate_sla_from_json for the probe's result file.
  local test_name=$1
  local qps=$2
  local max_concurrency=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
  local result_json
  local num_prompts_arg=""
  local client_command
  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
  # A result file already exists for this point: evaluate it, do not re-run.
  if [[ -f "$result_json" ]]; then
    evaluate_sla_from_json "$result_json"
    return $?
  fi
  # Scale the prompt count with concurrency, clamped to [MIN, MAX].
  # NOTE(review): num_prompts is not declared local here, so it is written to
  # the caller's scope — confirm nothing outside relies on or is hurt by that.
  if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
    num_prompts_arg="--num-prompts $num_prompts"
  fi
  client_command="vllm bench serve \
    --save-result \
    --result-dir $RESULTS_FOLDER \
    --result-filename ${new_test_name}.json \
    --request-rate $qps \
    --max-concurrency $max_concurrency \
    $num_prompts_arg \
    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
    $client_args_effective $client_remote_args "
  echo "Adaptive probe: $client_command"
  # The command is executed through `bash -c`; its exit status is not checked.
  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    bash -c "$client_command"
  fi
  # Record the server/client commands next to the result file.
  # NOTE(review): jq_output is also not declared local.
  jq_output=$(jq -n \
    --arg server "$server_command" \
    --arg client "$client_command" \
    --arg gpu "$gpu_type" \
    '{
      server_command: $server,
      client_command: $client,
      gpu_type: $gpu,
      adaptive_search: true
    }')
  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
  evaluate_sla_from_json "$result_json"
 }
 adaptive_refine_from_static_results() {
  # Refine the highest SLA-passing concurrency for one (test, qps) pair, using
  # the already-produced static results as a starting point, and write a
  # summary JSON to $RESULTS_FOLDER/<test>_qps_<qps>_sla_summary.json.
  #
  # Args:
  #   $1: test name   $2: qps   $3: whitespace-separated static concurrency list
  #   $4..$9: forwarded unchanged to run_single_serving_probe
  #           (tp, compilation_config mode, optimization level, client args,
  #            remote client args, server command)
  # Globals read:
  #   ENABLE_ADAPTIVE_CONCURRENCY, DRY_RUN, RESULTS_FOLDER,
  #   ADAPTIVE_MAX_PROBES, ADAPTIVE_MAX_CONCURRENCY
  #
  # Throughout, an SLA verdict of 0 means pass, 1 means fail, and any other
  # status is treated as "no verdict".
  local test_name=$1
  local qps=$2
  local max_concurrency_list_raw=$3
  local tp=$4
  local compilation_config_mode=$5
  local optimization_level=$6
  local client_args_effective=$7
  local client_remote_args=$8
  local server_command=$9
  local sorted_points
  local point
  local rc
  local static_last_pass=""
  local static_first_fail=""
  local largest_static=""
  local step_hint=1
  local previous_point=""
  local low
  local high
  local mid
  local probes=0
  local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
  # Do nothing unless adaptive search is enabled and this is not a dry run.
  [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
  [[ "${DRY_RUN:-0}" != "1" ]] || return 0
  # Split the raw list on whitespace, strip single quotes, keep only purely
  # numeric tokens, then sort ascending and de-duplicate.
  sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
  [[ -n "$sorted_points" ]] || return 0
  # Phase 1: walk the static points in ascending order, tracking the last
  # pass and stopping at the first fail seen after a pass. A fail seen before
  # any pass is not recorded.
  while read -r point; do
    [[ -z "$point" ]] && continue
    largest_static="$point"
    evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
    rc=$?
    if (( rc == 0 )); then
      static_last_pass="$point"
    elif (( rc == 1 )); then
      if [[ -n "$static_last_pass" ]]; then
        static_first_fail="$point"
        break
      fi
    fi
    # step_hint ends up as the gap between the last two scanned points
    # (at least 1); it seeds the upward search below.
    if [[ -n "$previous_point" ]]; then
      step_hint=$(( point - previous_point ))
      if (( step_hint < 1 )); then step_hint=1; fi
    fi
    previous_point="$point"
  done <<< "$sorted_points"
  # No static point passed: nothing to refine. static_first_fail is always
  # empty here because it is only set after a pass.
  if [[ -z "$static_last_pass" ]]; then
    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
    return 0
  fi
  # Phase 2a: a pass/fail bracket exists among the static points. Bisect it
  # until the bounds are adjacent, the probe budget runs out, or a probe gives
  # no verdict.
  if [[ -n "$static_first_fail" ]]; then
    low=$static_last_pass
    high=$static_first_fail
    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
      mid=$(( (low + high) / 2 ))
      probes=$(( probes + 1 ))
      run_single_serving_probe \
        "$test_name" "$qps" "$mid" "$tp" \
        "$compilation_config_mode" "$optimization_level" \
        "$client_args_effective" "$client_remote_args" "$server_command"
      rc=$?
      if (( rc == 0 )); then
        low=$mid
      elif (( rc == 1 )); then
        high=$mid
      else
        break
      fi
    done
    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
    return 0
  fi
  # Phase 2b: no fail was found after the last pass. Search upward from the
  # largest scanned static point, doubling the step after each pass and
  # capping the probed value at ADAPTIVE_MAX_CONCURRENCY.
  # NOTE(review): largest_static is the last scanned point regardless of its
  # verdict, so `low` may start at a point that produced no verdict — confirm
  # this is intended.
  low=$largest_static
  high=""
  while (( probes < ADAPTIVE_MAX_PROBES )); do
    point=$(( low + step_hint ))
    if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
      point=$ADAPTIVE_MAX_CONCURRENCY
    fi
    # Stop when the cap leaves no higher point to try.
    (( point > low )) || break
    probes=$(( probes + 1 ))
    run_single_serving_probe \
      "$test_name" "$qps" "$point" "$tp" \
      "$compilation_config_mode" "$optimization_level" \
      "$client_args_effective" "$client_remote_args" "$server_command"
    rc=$?
    if (( rc == 0 )); then
      low=$point
      (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
      step_hint=$(( step_hint * 2 ))
      if (( step_hint < 1 )); then step_hint=1; fi
    elif (( rc == 1 )); then
      high=$point
      break
    else
      break
    fi
  done
  # The upward search hit a failing point: bisect between the last pass and
  # that fail with whatever probe budget remains.
  if [[ -n "$high" ]]; then
    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
      mid=$(( (low + high) / 2 ))
      probes=$(( probes + 1 ))
      run_single_serving_probe \
        "$test_name" "$qps" "$mid" "$tp" \
        "$compilation_config_mode" "$optimization_level" \
        "$client_args_effective" "$client_remote_args" "$server_command"
      rc=$?
      if (( rc == 0 )); then
        low=$mid
      elif (( rc == 1 )); then
        high=$mid
      else
        break
      fi
    done
  fi
  write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
 }
 run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
-    server_args=$(json2args "$server_params")
+    # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
    server_model=$(echo "$server_params" | jq -r '.model // empty')
    if [[ -z "$server_model" || "$server_model" == "null" ]]; then
      echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
      exit 1
    fi
    server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
    server_args=$(json2args "$server_params_no_model")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    # ------------------------------------------------------------
    # Option 1: Dynamic num-prompts scaling based on max_concurrency
    #
    # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
    #   num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
    #
    # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
    # unchanged (i.e., whatever is in serving-tests-*.json).
    # ------------------------------------------------------------
    PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}"  # no default on purpose
    MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
    MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
    if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
      # Handles: --num-prompts 123   and   --num-prompts=123
      client_args_no_np="$(
        printf ' %s ' "$client_args" \
        | sed -E \
          -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
          -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
      )"
      # normalize whitespace
      client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
      client_args_no_np="$(echo "$client_args_no_np" | xargs)"
      client_args_effective="$client_args_no_np"
    else
      client_args_effective="$client_args"
    fi
    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
    fi
    # check if server model and client model is aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi
-    server_command="$server_envs vllm serve \
+    server_command="$server_envs vllm serve $server_model \
      $server_args"
    # run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
        # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
        num_prompts_arg=""
        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
          num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
          if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
          if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
          num_prompts_arg="--num-prompts $num_prompts"
        fi
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          $num_prompts_arg \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args $client_remote_args "
+          $client_args_effective $client_remote_args "
        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
      done
      adaptive_refine_from_static_results \
        "$test_name" "$qps" "$max_concurrency_list" "$tp" \
        "$compilation_config_mode" "$optimization_level" \
        "$client_args_effective" "$client_remote_args" "$server_command"
    done
    # clean up
@@ -532,6 +888,7 @@ main() {
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
  python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
  upload_to_buildkite
 }
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
@@ -0,0 +1,37 @@
 {
  "defaults": {
    "qps_list": [
      "inf"
    ],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
    },
    "server_parameters": {
      "dtype": "bfloat16",
      "model": "openai/whisper-large-v3-turbo"
    },
    "client_parameters": {
      "model": "openai/whisper-large-v3-turbo",
      "backend": "openai-audio",
      "endpoint": "/v1/audio/transcriptions",
      "dataset_name": "hf",
      "dataset_path": "openslr/librispeech_asr",
      "hf_subset": "clean",
      "hf_split": "test",
      "no_stream": "",
      "no_oversample": "",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {}
    }
  ]
 }
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -149,6 +149,39 @@
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_2048",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp2_random_2048_2048",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_2048_2048",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
@@ -188,6 +221,45 @@
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int8_tp1_random_128_128",
      "server_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int8_tp2_random_128_128",
      "server_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int8_tp4_random_128_128",
      "server_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -72,17 +72,6 @@
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_128",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
@@ -105,17 +94,6 @@
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_2048",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
@@ -139,14 +117,25 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
      "server_parameters": {
-        "tensor_parallel_size": 4
+        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp2_random_2048_2048",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 2048
      }
    }
  ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,7 +10,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -37,7 +36,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -64,7 +62,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -91,7 +88,6 @@
        "server_parameters": {
            "model": "deepseek-ai/DeepSeek-R1",
            "tensor_parallel_size": 8,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,7 +5,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -23,7 +22,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -41,7 +39,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -59,7 +56,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -83,7 +83,7 @@ steps:
        agents:
          queue: cpu_queue_postmerge
        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
          queue: cpu_queue_postmerge
        commands:
          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
        env:
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -166,12 +166,19 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
 EOF
 fi
-# Notify Slack if webhook is configured.
+# Notify Slack if webhook is configured and PR/branch are valid.
 if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
-    echo ">>> Sending Slack notification"
+    PR="${BUILDKITE_PULL_REQUEST:-}"
-    # Single quotes are intentional: the f-string expressions are Python, not shell.
+    BRANCH="${BUILDKITE_BRANCH:-}"
-    # shellcheck disable=SC2016
+
-    PAYLOAD=$(python3 -c '
+    # Skip notification if PR is invalid or branch is empty
    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
    else
        echo ">>> Sending Slack notification"
        # Single quotes are intentional: the f-string expressions are Python, not shell.
        # shellcheck disable=SC2016
        PAYLOAD=$(python3 -c '
 import json, os, sys
 pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
 branch = os.getenv("BUILDKITE_BRANCH", "unknown")
@@ -194,10 +201,11 @@ data = {
 print(json.dumps(data))
 ')
-    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
+        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-        -H 'Content-type: application/json' \
+            -H 'Content-type: application/json' \
-        -d "$PAYLOAD")
+            -d "$PAYLOAD")
-    echo "    Slack webhook response: $HTTP_CODE"
+        echo "    Slack webhook response: $HTTP_CODE"
    fi
 else
    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
 fi
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -205,6 +205,13 @@ re_quote_pytest_markers() {
      esac
      if $is_boundary; then
        # Strip surrounding double quotes if present (from upstream
        # single-to-double conversion); without this, wrapping below
        # would produce '"expr"' with literal double-quote characters.
        if [[ "$marker_buf" == '"'*'"' ]]; then
          marker_buf="${marker_buf#\"}"
          marker_buf="${marker_buf%\"}"
        fi
        # Flush the collected marker expression
        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
          output+="'${marker_buf}' "
@@ -242,6 +249,11 @@ re_quote_pytest_markers() {
  # Flush any trailing marker expression (marker at end of command)
  if $collecting && [[ -n "$marker_buf" ]]; then
    # Strip surrounding double quotes (see mid-stream flush comment)
    if [[ "$marker_buf" == '"'*'"' ]]; then
      marker_buf="${marker_buf#\"}"
      marker_buf="${marker_buf%\"}"
    fi
    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
      output+="'${marker_buf}'"
    else
@@ -321,15 +333,15 @@ apply_rocm_test_overrides() {
  # --- Entrypoint ignores ---
  if [[ $cmds == *" entrypoints/openai "* ]]; then
    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
-    --ignore=entrypoints/openai/test_audio.py \
+    --ignore=entrypoints/openai/chat_completion/test_audio.py \
-    --ignore=entrypoints/openai/test_shutdown.py \
+    --ignore=entrypoints/openai/completion/test_shutdown.py \
    --ignore=entrypoints/openai/test_completion.py \
    --ignore=entrypoints/openai/test_models.py \
    --ignore=entrypoints/openai/test_lora_adapters.py \
    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-    --ignore=entrypoints/openai/test_root_path.py \
+    --ignore=entrypoints/openai/chat_completion/test_root_path.py \
    --ignore=entrypoints/openai/test_tokenization.py \
-    --ignore=entrypoints/openai/test_prompt_validation.py "}
+    --ignore=entrypoints/openai/completion/test_prompt_validation.py "}
  fi
  if [[ $cmds == *" entrypoints/llm "* ]]; then
@@ -492,6 +504,8 @@ else
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -e BUILDKITE_PARALLEL_JOB \
    -e BUILDKITE_PARALLEL_JOB_COUNT \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
@@ -0,0 +1,65 @@
 #!/bin/bash
 # Setup for the CPU compatibility test: configure the environment, then
 # download, verify and unpack the SDE archive into ./sde.
 #
 # -e: abort on the first failing command; -u: treat unset variables as errors;
 # -o pipefail: a pipeline fails if any stage fails; -x: trace every command.
 set -euox pipefail
 export VLLM_CPU_KVCACHE_SPACE=1 
 export VLLM_CPU_CI_ENV=1
 # Reduce sub-processes for acceleration
 export TORCH_COMPILE_DISABLE=1 
 export VLLM_ENABLE_V1_MULTIPROCESSING=0
 # Pinned SDE archive name and the SHA-256 digest it is expected to have.
 SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
 SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
 wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
 # Verify the download before unpacking; a digest mismatch makes sha256sum
 # exit non-zero, which stops the script because of `set -e`.
 echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
 mkdir -p sde
 # --strip-components=1 drops the archive's top-level directory so that the
 # extracted files land directly under ./sde/.
 tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
 # Wait for a background process and report how it finished.
 #
 # Arguments:
 #   $1 - PID of a background job started by the calling shell
 #   $2 - path of the log file that captured the job's output
 # Returns:
 #   0 if the process exited with status 0.
 #   1 if it exited non-zero (after printing the last 50 lines of the log),
 #     or if either argument is missing or empty.
 #
 # Only the exit status decides success; the log file is printed on failure
 # but its contents are never inspected.
 wait_for_pid_and_check_log() {
    # ${N:-} keeps a missing argument from aborting the script under `set -u`,
    # so the usage message below is actually reachable.
    local pid="${1:-}"
    local log_file="${2:-}"
    local exit_status=0
    if [ -z "$pid" ] || [ -z "$log_file" ]; then
        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>" >&2
        return 1
    fi
    echo "Waiting for process $pid to finish..."
    # `wait` returns the exit status of the waited-for process. Capturing it
    # through `||` keeps a non-zero status from tripping `set -e`.
    wait "$pid" || exit_status=$?
    if [ "$exit_status" -eq 0 ]; then
        echo "Process $pid finished with exit status $exit_status (Success)."
        echo "Process exited successfully."
        return 0
    fi
    echo "Process $pid finished with exit status $exit_status (Failure)."
    echo "Process exited with a non-zero status."
    echo "--- Last few lines of log file: $log_file ---"
    # An unreadable log must not mask the real failure being reported.
    tail -n 50 "$log_file" || echo "(could not read log file: $log_file)" >&2
    echo "---------------------------------------------"
    return 1 # Indicate failure based on exit status
 }
 # Launch the same offline-inference run under three emulated CPU models in
 # parallel; each run writes its output to its own log file.
 # Test Skylake (AVX512F)
 ./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
 PID_TEST_0=$!
 # Test Cascade Lake (AVX512F + VNNI)
 ./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
 PID_TEST_1=$!
 # Test Cooper Lake (AVX512F + VNNI + BF16)
 ./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
 PID_TEST_2=$!
 # Wait for every run before failing. With a bare call, `set -e` would abort
 # on the first failure and the remaining runs would never be waited for or
 # have their results and log tails reported.
 failed=0
 wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || failed=1
 wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || failed=1
 wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || failed=1
 # Non-zero if any emulated run failed.
 exit "$failed"
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
  # Run model tests
  docker exec cpu-test bash -c "
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
  # Run basic model test
  podman exec -it "$container_id" bash -c "
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container
 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '
 EXITCODE=$?
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -34,17 +34,17 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
-  PLATFORM_ARGS=("--no-async-scheduling")
+  PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
  # Non-ROCm platform (CUDA/other)
--- a/.buildkite/scripts/tool_call/run-bfcl-eval.sh
+++ b/.buildkite/scripts/tool_call/run-bfcl-eval.sh
@@ -0,0 +1,248 @@
 #!/bin/bash
 # Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
 # evaluation against a local vLLM server.
 #
 # Usage:
 #   # Run with defaults (gpt-oss-20b, multi_turn)
 #   bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
 #
 #   # Run with gpt-oss-120b and multiple test categories
 #   BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
 #     BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
 #     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
 #
 #   # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
 #   BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
 #     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
 #   BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
 #     bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
 #
 # Environment variables (all optional, with defaults):
 #   BFCL_MODEL          - HF model name (default: openai/gpt-oss-20b)
 #   BFCL_API_TYPE       - API type: "chat_completions" or "responses" (default: chat_completions)
 #   BFCL_OUTPUT_DIR     - Directory for BFCL results (default: current working directory)
 #   BFCL_TEST_CATEGORY  - BFCL test categories (default: multi_turn)
 #   BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
 #   BFCL_NUM_THREADS    - Threads for BFCL generate (default: 8)
 #   BFCL_TP_SIZE        - Tensor parallel size (default: 1)
 #   BFCL_MAX_MODEL_LEN  - Max model length (default: 4096)
 #   BFCL_PORT           - Server port (default: 8000)
 #   BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
 #   BFCL_EXTRA_ARGS     - Additional vLLM server args
 set -euo pipefail
 # ---- Configuration ----
 MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
 API_TYPE="${BFCL_API_TYPE:-chat_completions}"
 OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
 TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
 TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
 NUM_THREADS="${BFCL_NUM_THREADS:-8}"
 TP_SIZE="${BFCL_TP_SIZE:-1}"
 MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
 PORT="${BFCL_PORT:-8000}"
 REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
 EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
 # Set up output directory
 if [ -n "$OUTPUT_DIR" ]; then
    mkdir -p "$OUTPUT_DIR"
    OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
 fi
 echo "============================================"
 echo "BFCL Tool Call Correctness Evaluation"
 echo "============================================"
 echo "Model:          $MODEL"
 echo "Tool parser:    $TOOL_CALL_PARSER"
 echo "API type:       $API_TYPE"
 echo "Output dir:     ${OUTPUT_DIR:-<cwd>}"
 echo "Test category:  $TEST_CATEGORY"
 echo "TP size:        $TP_SIZE"
 echo "Max model len:  $MAX_MODEL_LEN"
 echo "Port:           $PORT"
 echo "Num threads:    $NUM_THREADS"
 echo "============================================"
 # ---- Install bfcl-eval if missing ----
 if ! python3 -c "import bfcl_eval" 2>/dev/null; then
    echo "Installing bfcl-eval..."
    pip install "bfcl-eval>=2025.10.20.1,<2026"
 fi
 # ---- Cleanup handler ----
 SERVER_PID=""
 cleanup() {
    # Installed as the EXIT trap handler: stop the background vLLM server (if
    # one was started) and remove the lock directories BFCL leaves behind.
    if [[ -n "$SERVER_PID" ]]; then
        echo "Stopping vLLM server (pid=$SERVER_PID)..."
        # Best-effort: the server may already have exited, so ignore failures.
        kill "$SERVER_PID" 2>/dev/null || true
        wait "$SERVER_PID" 2>/dev/null || true
    fi
    # Remove BFCL lock files (created by filelock for thread-safe writes),
    # both in the current directory and in the output directory when set.
    local lock_dirs=(".file_locks/")
    if [[ -n "${OUTPUT_DIR:-}" ]]; then
        lock_dirs+=("$OUTPUT_DIR/.file_locks/")
    fi
    local lock_dir
    for lock_dir in "${lock_dirs[@]}"; do
        rm -rf "$lock_dir"
    done
 }
 trap cleanup EXIT
 # ---- Start vLLM server ----
 echo "Starting vLLM server..."
 # Build the command line as an array so each value is passed to vllm as a
 # single argument, regardless of its contents.
 SERVE_ARGS=(
    "$MODEL"
    --port "$PORT"
    --enable-auto-tool-choice
    --tool-call-parser "$TOOL_CALL_PARSER"
    --tensor-parallel-size "$TP_SIZE"
    --max-model-len "$MAX_MODEL_LEN"
    --enforce-eager
    --no-enable-prefix-caching
 )
 # Append reasoning parser if specified
 if [ -n "$REASONING_PARSER" ]; then
    SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
 fi
 # Append any extra args
 if [ -n "$EXTRA_ARGS" ]; then
    # read -ra splits on IFS (whitespace by default) and does not interpret
    # quotes, so an individual extra argument cannot contain spaces.
    read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
    SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
 fi
 echo "Command: vllm serve ${SERVE_ARGS[*]}"
 # Launch in the background and remember the PID so the cleanup handler can
 # stop the server when the script exits.
 vllm serve "${SERVE_ARGS[@]}" &
 SERVER_PID=$!
 # ---- Wait for server to be ready ----
 # Poll the /health endpoint every 2s. SECONDS_WAITED counts only the sleep
 # intervals (not the time spent in curl), so the 600s timeout is approximate.
 echo "Waiting for vLLM server to start (timeout: 600s)..."
 SECONDS_WAITED=0
 until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
    # Fail fast if the server process has already exited (e.g. invalid
    # arguments or a crash while loading the model) instead of polling a
    # dead process until the timeout expires.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo ""
        echo "ERROR: vLLM server process (pid=$SERVER_PID) exited before becoming ready"
        exit 1
    fi
    if [ "$SECONDS_WAITED" -ge 600 ]; then
        echo ""
        echo "ERROR: vLLM server failed to start within 600s"
        exit 1
    fi
    # Emit a progress line every 30s so CI logs show the job is still alive.
    if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
        echo "  Still waiting... (${SECONDS_WAITED}s elapsed)"
    fi
    sleep 2
    SECONDS_WAITED=$((SECONDS_WAITED + 2))
 done
 echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
 # ---- Run BFCL evaluation ----
 # bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
 # functions that must be called from Python. The MODEL_CONFIG_MAPPING must
 # be patched in-process so BFCL knows to use the OpenAI-compatible handler
 # against our local vLLM server.
 bfcl_exit_code=0
 python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
 import os
 import sys
 model = sys.argv[1]
 test_category = sys.argv[2]
 num_threads = int(sys.argv[3])
 port = sys.argv[4]
 api_type = sys.argv[5]
 output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
 os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
 os.environ["OPENAI_API_KEY"] = "dummy"
 os.environ["BFCL_PROJECT_ROOT"] = output_dir
 import bfcl_eval.constants.model_config as bfcl_model_config
 from bfcl_eval.constants.model_config import ModelConfig
 from bfcl_eval.model_handler.api_inference.openai_completion import (
    OpenAICompletionsHandler,
 )
 from bfcl_eval.model_handler.api_inference.openai_response import (
    OpenAIResponsesHandler,
 )
 if api_type == "responses":
    handler = OpenAIResponsesHandler
 else:
    handler = OpenAICompletionsHandler
 bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
    model_name=model,
    display_name=f"{model} (FC) (vLLM)",
    url=f"https://huggingface.co/{model}",
    org="",
    license="apache-2.0",
    model_handler=handler,
    input_price=None,
    output_price=None,
    is_fc_model=True,
    underscore_to_dot=True,
 )
 from bfcl_eval.__main__ import evaluate, generate
 import inspect
 import typer
 def _get_default_kwargs(function):
    kwargs = {}
    for k, v in inspect.signature(function).parameters.items():
        if v.default is not inspect.Parameter.empty:
            default = v.default
            if isinstance(default, typer.models.OptionInfo):
                default = default.default
            kwargs[k] = default
    return kwargs
 # ---- generate ----
 print(f"=== BFCL generate: model={model} test_category={test_category} ===")
 gen_kwargs = _get_default_kwargs(generate)
 gen_kwargs["model"] = [model]
 gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
 gen_kwargs["skip_server_setup"] = True
 gen_kwargs["num_threads"] = num_threads
 generate(**gen_kwargs)
 # ---- evaluate ----
 print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
 eval_kwargs = _get_default_kwargs(evaluate)
 eval_kwargs["model"] = [model]
 eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
 evaluate(**eval_kwargs)
 print("=== BFCL evaluation completed successfully ===")
 PYEOF
 # ---- Upload results to buildkite ----
 if command -v buildkite-agent &>/dev/null; then
    if [ $bfcl_exit_code -eq 0 ]; then
        STYLE="success"
        STATUS="PASSED"
    else
        STYLE="error"
        STATUS="FAILED"
    fi
    buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
 ### BFCL Tool Call Correctness - ${STATUS}
 - **Model:** \`${MODEL}\`
 - **Parser:** \`${TOOL_CALL_PARSER}\`
 - **API type:** \`${API_TYPE}\`
 - **Test category:** \`${TEST_CATEGORY}\`
 EOF
    # BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
    # $BFCL_PROJECT_ROOT/score/
    RESULTS_ROOT="${OUTPUT_DIR:-.}"
    if [ -d "$RESULTS_ROOT/result" ]; then
        buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
    fi
    if [ -d "$RESULTS_ROOT/score" ]; then
        buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
    fi
 fi
 exit $bfcl_exit_code
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -72,7 +72,7 @@ obj_json="objects.json"
 aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
 mkdir -p "$INDICES_OUTPUT_DIR"
-# call script to generate indicies for all existing wheels
+# call script to generate indices for all existing wheels
 # these indices have relative paths that could work as long as they are next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
 # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
-# generate source tarball
+# generate source distribution using setup.py
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
+python setup.py sdist --dist-dir=$DIST_DIR
 ls -la $DIST_DIR
 SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
 echo "Found sdist: $SDIST_FILE"
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
 if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  exit 1
 fi
-python3 -m twine check "$PYPI_WHEEL_FILES"
+python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
+python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
-echo "Wheels uploaded to PyPI"
+echo "Wheels and source distribution uploaded to PyPI"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -14,8 +14,3 @@ steps:
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -36,6 +36,16 @@ steps:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 - label: AsyncTP Correctness Tests (B200)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: b200
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 - label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
@@ -91,8 +101,8 @@ steps:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
 - label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
@@ -122,9 +132,9 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
 - label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
@@ -140,8 +150,8 @@ steps:
  commands:
    - nvidia-smi
    # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
 - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
@@ -195,7 +205,7 @@ steps:
  commands:
    - nvidia-smi
    # Run all models but only FLASHINFER, Inductor partition and native custom ops
-    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -50,23 +50,18 @@ steps:
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
+- label: Distributed Torchrun + Examples (4 GPUs)
-  timeout_in_minutes: 50
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
-  - tests/distributed/test_utils
+  - tests/distributed/test_torchrun_example.py
-  - tests/distributed/test_pynccl
+  - tests/distributed/test_torchrun_example_moe.py
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
@@ -84,19 +79,6 @@ steps:
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  # OLD rlhf examples
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
@@ -106,6 +88,47 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
 - label: Distributed DP Tests (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_utils
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
 - label: Distributed Compile + Comm (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/distributed/test_symm_mem_allreduce.py
  - tests/distributed/test_multiproc_executor.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # test multi-node TP with multiproc executor (simulated on single node)
  - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
  device: h100
@@ -146,7 +169,7 @@ steps:
  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
-    # - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
+    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -210,6 +233,19 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 - label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
  timeout_in_minutes: 30
  device: a100
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - vllm/v1/worker/kv_connector_model_runner_mixin.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
 - label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -1,5 +1,5 @@
 group: Engine
-depends_on: 
+depends_on:
  - image-build
 steps:
 - label: Engine
@@ -14,28 +14,30 @@ steps:
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine (1 GPU)
+- label: Engine (1 GPU)
-  timeout_in_minutes: 45
+  timeout_in_minutes: 30
  source_file_dependencies:
-    - vllm/
+    - vllm/v1/engine/
-    - tests/v1
+    - tests/v1/engine/
  commands:
    # TODO: accuracy does not match on H100, whether or not
    # VLLM_USE_FLASHINFER_SAMPLER is set.
    - pytest -v -s v1/e2e
    # Run this test standalone for now;
    # need to untangle the (implicit) use of spawn/fork across the tests.
    - pytest -v -s v1/engine/test_preprocess_error_handling.py
    # Run the rest of v1/engine tests
    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
-  mirror:
+
-    amd:
+- label: e2e Scheduling (1 GPU)
-      device: mi325_1
+  timeout_in_minutes: 30
-      depends_on:
+  source_file_dependencies:
-      - image-build-amd
+    - vllm/v1/
-      commands:
+    - tests/v1/e2e/general/
-      - pytest -v -s v1/e2e
+  commands:
-      - pytest -v -s v1/engine
+    - pytest -v -s v1/e2e/general/test_async_scheduling.py
 - label: e2e Core (1 GPU)
  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/v1/
    - tests/v1/e2e/general/
  commands:
    - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
 - label: V1 e2e (2 GPUs)
  timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -46,7 +48,7 @@ steps:
    - tests/v1/e2e
  commands:
    # Only run tests that need exactly 2 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
  mirror:
    amd:
      device: mi325_2
@@ -62,7 +64,7 @@ steps:
    - tests/v1/e2e
  commands:
    # Only run tests that need 4 GPUs
-    - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
+    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
  mirror:
    amd:
      device: mi325_4
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -24,11 +24,6 @@ steps:
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Entrypoints Integration (API Server 1)
  timeout_in_minutes: 130
@@ -39,8 +34,13 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
  - pytest -v -s entrypoints/test_chat_utils.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Entrypoints Integration (API Server 2)
  timeout_in_minutes: 130
@@ -65,11 +65,6 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Entrypoints Integration (Responses API)
  timeout_in_minutes: 50
@@ -87,6 +82,11 @@ steps:
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: OpenAI API Correctness
  timeout_in_minutes: 30
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -8,8 +8,9 @@ steps:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  - tests/kernels/test_concat_mla_q.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
 - label: Kernels Attention Test %N
  timeout_in_minutes: 35
@@ -96,7 +97,7 @@ steps:
  - vllm/platforms/cuda.py
  commands:
    - nvidia-smi
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_attention_selector.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -67,12 +67,13 @@ steps:
  - examples/
  commands:
    - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic/chat.py # for basic
+     # for basic
-    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 basic/offline_inference/chat.py
-    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
-    - python3 offline_inference/basic/classify.py
+    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-    - python3 offline_inference/basic/embed.py
+    - python3 basic/offline_inference/classify.py
-    - python3 offline_inference/basic/score.py
+    - python3 basic/offline_inference/embed.py
    - python3 basic/offline_inference/score.py
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
--- a/.buildkite/test_areas/model_executor.yaml
+++ b/.buildkite/test_areas/model_executor.yaml
@@ -9,9 +9,9 @@ steps:
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+    - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py
--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -0,0 +1,110 @@
 group: Model Runner V2
 depends_on:
  - image-build
 steps:
 - label: Model Runner V2 Core Tests
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/v1/worker/gpu/
  - vllm/v1/worker/gpu_worker.py
  - vllm/v1/core/sched/
  - vllm/v1/attention/
  - tests/v1/engine/test_llm_engine.py
  - tests/v1/e2e/
  - tests/v1/entrypoints/llm/test_struct_output_generate.py
  commands:
  - set -x
  - export VLLM_USE_V2_MODEL_RUNNER=1
  - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
  # This requires eager until we sort out CG correctness issues.
  # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
  - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
  - pytest -v -s v1/e2e/general/test_context_length.py
  - pytest -v -s v1/e2e/general/test_min_tokens.py
  # Temporary hack filter to exclude ngram spec decoding based tests.
  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
 - label: Model Runner V2 Examples
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
    - vllm/v1/worker/gpu/
    - vllm/v1/core/sched/
    - vllm/v1/worker/gpu_worker.py
    - examples/offline_inference/
    - examples/basic/offline_inference/
    - examples/pooling/embed/vision_embedding_offline.py
    - examples/others/tensorize_vllm_model.py
  commands:
    - set -x
    - export VLLM_USE_V2_MODEL_RUNNER=1
    - pip install tensorizer # for tensorizer test
    - python3 basic/offline_inference/chat.py # for basic
    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
    #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10  # TODO
    #- python3 basic/offline_inference/embed.py   # TODO
    # for multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 - label: Model Runner V2 Distributed (2 GPUs)
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:
    - vllm/v1/worker/gpu/
    - vllm/v1/worker/gpu_worker.py
    - tests/basic_correctness/test_basic_correctness.py
    - tests/v1/distributed/test_async_llm_dp.py
    - tests/v1/distributed/test_eagle_dp.py
  commands:
    - set -x
    - export VLLM_USE_V2_MODEL_RUNNER=1
    # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
    - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
    # https://github.com/NVIDIA/nccl/issues/1838
    - export NCCL_CUMEM_HOST_ENABLE=0
    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
    - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
 # These require fix https://github.com/vllm-project/vllm/pull/36280
 - label: Model Runner V2 Pipeline Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
    - vllm/v1/worker/gpu/
    - vllm/v1/worker/gpu_worker.py
    - tests/distributed/test_pipeline_parallel.py
    #- tests/distributed/test_pp_cudagraph.py
  commands:
    - set -x
    - export VLLM_USE_V2_MODEL_RUNNER=1
    - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
    # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
    #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
 - label: Model Runner V2 Spec Decode
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/worker/gpu/
  - vllm/v1/worker/gpu_worker.py
  - tests/v1/spec_decode/test_max_len.py
  - tests/v1/e2e/spec_decode/test_spec_decode.py
  commands:
  - set -x
  - export VLLM_USE_V2_MODEL_RUNNER=1
  - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
  - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -65,7 +65,7 @@ steps:
    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
-    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/basic/offline_inference/chat.py
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -2,16 +2,65 @@ group: Models - Multimodal
 depends_on: 
  - image-build
 steps:
- label: Multi-Modal Models (Standard) # 60min
+- label: "Multi-Modal Models (Standard) 1: qwen2"
-  timeout_in_minutes: 80
+  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
    - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
    - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: "Multi-Modal Models (Standard) 4: other + whisper"
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Multi-Modal Processor Test (CPU)
  depends_on: 
@@ -54,6 +103,11 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Multi-Modal Models (Extended) 2
  optional: true
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -15,9 +15,12 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  # begin io_processor plugins test
  # test generic io_processor plugins functions
  - pytest -v -s ./plugins_tests/test_io_processor_plugins.py
  # test Terratorch io_processor plugins
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # test bge_m3_sparse io_processor plugin
  - pip install -e ./plugins/bge_m3_sparse_plugin
@@ -33,6 +36,6 @@ steps:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/spec_decode.yaml
+++ b/.buildkite/test_areas/spec_decode.yaml
@@ -0,0 +1,40 @@
 group: Spec Decode
 depends_on:
  - image-build
 steps:
 - label: Spec Decode Eagle
  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/v1/spec_decode/
    - vllm/v1/worker/gpu/spec_decode/
    - tests/v1/e2e/spec_decode/
  commands:
    - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
 - label: Spec Decode Speculators + MTP
  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/v1/spec_decode/
    - vllm/v1/worker/gpu/spec_decode/
    - vllm/transformers_utils/configs/speculators/
    - tests/v1/e2e/spec_decode/
  commands:
    - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
 - label: Spec Decode Ngram + Suffix
  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/v1/spec_decode/
    - vllm/v1/worker/gpu/spec_decode/
    - tests/v1/e2e/spec_decode/
  commands:
    - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
 - label: Spec Decode Draft Model
  timeout_in_minutes: 30
  source_file_dependencies:
    - vllm/v1/spec_decode/
    - vllm/v1/worker/gpu/spec_decode/
    - tests/v1/e2e/spec_decode/
  commands:
    - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -3,6 +3,7 @@ pull_request_rules:
  description: Automatically apply documentation label
  conditions:
    - label != stale
    - -closed
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@@ -26,7 +27,7 @@ pull_request_rules:
        Hi @{{author}}, the pre-commit checks have failed. Please run:
        ```bash 
-        uv pip install pre-commit
+        uv pip install "pre-commit>=4.5.1"
        pre-commit install
        pre-commit run --all-files
        ```
@@ -37,15 +38,13 @@ pull_request_rules:
        > [!TIP]
        > <details>
-        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <summary>Is <code>mypy</code> failing?</summary>
        > <br/>
-        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        > <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
        >
        > ```bash
        > # For mypy (substitute "3.10" with the failing version if needed)
        > pre-commit run --hook-stage manual mypy-3.10
        > # For markdownlint
        > pre-commit run --hook-stage manual markdownlint
        > ```
        > </details>
@@ -335,7 +334,7 @@ pull_request_rules:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+      - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
@@ -382,7 +381,7 @@ pull_request_rules:
    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
-      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+      - files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -6,6 +6,9 @@ on:
      - main
  workflow_dispatch:  # Manual trigger
 permissions:
  contents: read
 jobs:
  macos-m1-smoke-test:
    runs-on: macos-latest
--- a/.gitignore
+++ b/.gitignore
@@ -189,11 +189,9 @@ cython_debug/
 .vscode/
 # Claude
 CLAUDE.md
 .claude/
 # Codex
 AGENTS.md
 .codex/
 # Cursor
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-  rev: v1.38.1
+  rev: v1.43.5
  hooks:
  - id: typos
    args: [--force-exclude]
@@ -24,12 +24,13 @@ repos:
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
- repo: https://github.com/igorshubovych/markdownlint-cli
+- repo: https://github.com/DavidAnson/markdownlint-cli2
-  rev: v0.45.0
+  rev: v0.21.0
  hooks:
-  - id: markdownlint
+  - id: markdownlint-cli2
-    exclude: '.*\.inc\.md'
+    language_version: lts
-    stages: [manual] # Only run in CI
+    args: [--fix]
    exclude: ^CLAUDE\.md$
 - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
@@ -55,7 +56,7 @@ repos:
      language: python
      types_or: [python, pyi]
      require_serial: true
-      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
+      additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -127,6 +128,13 @@ repos:
    language: python
    types: [python]
    additional_dependencies: [regex]
  # prevent use of torch.cuda APIs
  - id: check-torch-cuda-call
    name: "Prevent new 'torch.cuda' APIs call"
    entry: python tools/pre_commit/check_torch_cuda.py
    language: python
    types: [python]
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/pre_commit/validate_config.py
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,6 +9,7 @@ build:
    python: "3.12"
  jobs:
    post_checkout:
      # - bash docs/maybe_skip_pr_build.sh
      - git fetch origin main --unshallow --no-tags --filter=blob:none || true
    pre_create_environment:
      - pip install uv
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,113 @@
 # Agent Instructions for vLLM
 > These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
 > Breaching these guidelines can result in automatic banning.
 ## 1. Contribution Policy (Mandatory)
 ### Duplicate-work checks
 Before proposing a PR, run these checks:
 ```bash
 gh issue view <issue_number> --repo vllm-project/vllm --comments
 gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
 gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
 ```
 - If an open PR already addresses the same fix, do not open another.
 - If your approach is materially different, explain the difference in the issue.
 ### No low-value busywork PRs
 Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
 ### Accountability
 - Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
 - The submitting human must review every changed line and run relevant tests.
 - PR descriptions for AI-assisted work **must** include:
    - Why this is not duplicating an existing PR.
    - Test commands run and results.
    - Clear statement that AI assistance was used.
 ### Fail-closed behavior
 If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
 ---
 ## 2. Development Workflow
 ### Environment setup
 ```bash
 # Install `uv` if you don't have it already:
 curl -LsSf https://astral.sh/uv/install.sh | sh
 # Always use `uv` for Python environment management:
 uv venv --python 3.12
 source .venv/bin/activate
 # Always make sure `pre-commit` and its hooks are installed:
 uv pip install -r requirements/lint.txt
 pre-commit install
 ```
 ### Installing dependencies
 ```bash
 # If you are only making Python changes:
 VLLM_USE_PRECOMPILED=1 uv pip install -e .
 # If you are also making C/C++ changes:
 uv pip install -e .
 ```
 ### Running tests
 Tests require extra dependencies.
 All versions for test dependencies should be read from `requirements/test.txt`.
 ```bash
 # Install bare minimum test dependencies:
 uv pip install pytest pytest-asyncio tblib
 # Install additional test dependencies as needed, or install them all as follows:
 uv pip install -r requirements/test.txt
 # Run specific test from specific test file
 pytest tests/path/to/test.py -v -s -k test_name
 # Run all tests in directory
 pytest tests/path/to/dir -v -s
 ```
 ### Running linters
 ```bash
 # Run all pre-commit hooks on staged files:
 pre-commit run
 # Run on all files:
 pre-commit run --all-files
 # Run a specific hook:
 pre-commit run ruff-check --all-files
 # Run mypy as it is in CI:
 pre-commit run mypy-3.10 --all-files --hook-stage manual
 ```
 ### Commit messages
 Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
 ```text
 Your commit message here
 Co-authored-by: GitHub Copilot
 Co-authored-by: Claude
 Co-authored-by: gemini-code-assist
 Signed-off-by: Your Name <your.email@example.com>
 ```
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1 @@
@AGENTS.md
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
 # ROCm installation prefix. Default to /opt/rocm but allow override via
 # -DROCM_PATH=/your/rocm/path when invoking cmake.
--- a/benchmarks/attention_benchmarks/README.md
+++ b/benchmarks/attention_benchmarks/README.md
@@ -187,7 +187,7 @@ python benchmark.py \
 ## Hardware Requirements
 | Backend | Hardware |
-|---------|----------|
+| ------- | -------- |
 | Flash/Triton/FlashInfer | Any CUDA GPU |
 | CUTLASS MLA | Blackwell (SM100+) |
 | FlashAttn MLA | Hopper (SM90+) |
--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
@@ -47,6 +47,8 @@ from common import (
    is_mla_backend,
 )
 from vllm.v1.worker.workspace import init_workspace_manager
 def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -59,7 +61,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
    """Run MLA benchmark with appropriate backend."""
    from mla_runner import run_mla_benchmark as run_mla
-    return run_mla(config.backend, config, **kwargs)
+    return run_mla(
        config.backend, config, prefill_backend=config.prefill_backend, **kwargs
    )
 def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -440,20 +444,27 @@ def main():
    # Backend selection
    parser.add_argument(
        "--backends",
        "--decode-backends",
        nargs="+",
-        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
+        help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
        "flashinfer_mla, flashattn_mla, flashmla)",
    )
    parser.add_argument(
        "--backend",
        help="Single backend (alternative to --backends)",
    )
    parser.add_argument(
        "--prefill-backends",
        nargs="+",
        help="Prefill backends to compare (fa2, fa3, fa4). "
        "Uses the first decode backend for impl construction.",
    )
    # Batch specifications
    parser.add_argument(
        "--batch-specs",
        nargs="+",
-        default=["q2k", "8q1s1k"],
+        default=None,
        help="Batch specifications using extended grammar",
    )
@@ -469,6 +480,21 @@ def main():
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
    parser.add_argument(
        "--kv-cache-dtype",
        default="auto",
        choices=["auto", "fp8"],
        help="KV cache dtype: auto or fp8",
    )
    parser.add_argument(
        "--cuda-graphs",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Launch kernels with CUDA graphs to eliminate CPU overhead "
            "in measurements (default: True)"
        ),
    )
    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
@@ -502,7 +528,7 @@ def main():
        # Override args with YAML values, but CLI args take precedence
        # Check if CLI provided backends (they would be non-None and not default)
-        cli_backends_provided = args.backends is not None or args.backend is not None
+        cli_backends_provided = args.backend is not None or args.backends is not None
        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
@@ -512,6 +538,12 @@ def main():
            elif "backends" in yaml_config:
                args.backends = yaml_config["backends"]
                args.backend = None
            elif "decode_backends" in yaml_config:
                args.backends = yaml_config["decode_backends"]
                args.backend = None
        # Prefill backends (e.g., ["fa3", "fa4"])
        args.prefill_backends = yaml_config.get("prefill_backends", None)
        # Check for special modes
        if "mode" in yaml_config:
@@ -521,21 +553,24 @@ def main():
        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
-        if "batch_spec_ranges" in yaml_config:
+        # CLI --batch-specs takes precedence over YAML when provided.
-            # Generate batch specs from ranges
+        cli_batch_specs_provided = args.batch_specs is not None
-            generated_specs = generate_batch_specs_from_ranges(
+        if not cli_batch_specs_provided:
-                yaml_config["batch_spec_ranges"]
+            if "batch_spec_ranges" in yaml_config:
-            )
+                # Generate batch specs from ranges
-            # Combine with any explicit batch_specs
+                generated_specs = generate_batch_specs_from_ranges(
-            if "batch_specs" in yaml_config:
+                    yaml_config["batch_spec_ranges"]
-                args.batch_specs = yaml_config["batch_specs"] + generated_specs
+                )
-            else:
+                # Combine with any explicit batch_specs
-                args.batch_specs = generated_specs
+                if "batch_specs" in yaml_config:
-            console.print(
+                    args.batch_specs = yaml_config["batch_specs"] + generated_specs
-                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
+                else:
-            )
+                    args.batch_specs = generated_specs
-        elif "batch_specs" in yaml_config:
+                console.print(
-            args.batch_specs = yaml_config["batch_specs"]
+                    f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
                )
            elif "batch_specs" in yaml_config:
                args.batch_specs = yaml_config["batch_specs"]
        if "batch_sizes" in yaml_config:
            args.batch_sizes = yaml_config["batch_sizes"]
@@ -560,6 +595,10 @@ def main():
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]
        if "kv_cache_dtype" in yaml_config:
            args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
        if "cuda_graphs" in yaml_config:
            args.cuda_graphs = yaml_config["cuda_graphs"]
        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
@@ -613,10 +652,19 @@ def main():
    # Determine backends
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
    prefill_backends = getattr(args, "prefill_backends", None)
    if not args.batch_specs:
        args.batch_specs = ["q2k", "8q1s1k"]
    console.print(f"Backends: {', '.join(backends)}")
    if prefill_backends:
        console.print(f"Prefill backends: {', '.join(prefill_backends)}")
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
    console.print(f"KV cache dtype: {args.kv_cache_dtype}")
    console.print(f"CUDA graphs: {args.cuda_graphs}")
    console.print()
    init_workspace_manager(args.device)
    # Run benchmarks
    all_results = []
@@ -669,6 +717,8 @@ def main():
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                        kv_cache_dtype=args.kv_cache_dtype,
                        use_cuda_graphs=args.cuda_graphs,
                    )
                    # Add decode pipeline config
@@ -821,6 +871,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
            "kv_cache_dtype": args.kv_cache_dtype,
            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_model_parameter_sweep(
            backends,
@@ -843,6 +895,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
            "kv_cache_dtype": args.kv_cache_dtype,
            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -850,37 +904,95 @@ def main():
    else:
        # Normal mode: compare backends
-        total = len(backends) * len(args.batch_specs)
+        decode_results = []
        prefill_results = []
-        with tqdm(total=total, desc="Benchmarking") as pbar:
+        # Run decode backend comparison
-            for spec in args.batch_specs:
+        if not prefill_backends:
-                for backend in backends:
+            # No prefill backends specified: compare decode backends as before
-                    config = BenchmarkConfig(
+            total = len(backends) * len(args.batch_specs)
                        backend=backend,
                        batch_spec=spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
                    )
-                    result = run_benchmark(config)
+            with tqdm(total=total, desc="Benchmarking") as pbar:
-                    all_results.append(result)
+                for spec in args.batch_specs:
                    for backend in backends:
                        config = BenchmarkConfig(
                            backend=backend,
                            batch_spec=spec,
                            num_layers=args.num_layers,
                            head_dim=args.head_dim,
                            num_q_heads=args.num_q_heads,
                            num_kv_heads=args.num_kv_heads,
                            block_size=args.block_size,
                            device=args.device,
                            repeats=args.repeats,
                            warmup_iters=args.warmup_iters,
                            profile_memory=args.profile_memory,
                            kv_cache_dtype=args.kv_cache_dtype,
                            use_cuda_graphs=args.cuda_graphs,
                        )
-                    if not result.success:
+                        result = run_benchmark(config)
-                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
+                        decode_results.append(result)
-                    pbar.update(1)
+                        if not result.success:
                            console.print(
                                f"[red]Error {backend} {spec}: {result.error}[/]"
                            )
-        # Display results
+                        pbar.update(1)
-        console.print("\n[bold green]Results:[/]")
+
-        formatter = ResultsFormatter(console)
+            console.print("\n[bold green]Results:[/]")
-        formatter.print_table(all_results, backends)
+            formatter = ResultsFormatter(console)
            formatter.print_table(decode_results, backends)
        # Run prefill backend comparison
        if prefill_backends:
            # Use first decode backend for impl construction
            decode_backend = backends[0]
            total = len(prefill_backends) * len(args.batch_specs)
            console.print(
                f"[yellow]Prefill comparison mode: "
                f"using {decode_backend} for decode impl[/]"
            )
            with tqdm(total=total, desc="Prefill benchmarking") as pbar:
                for spec in args.batch_specs:
                    for pb in prefill_backends:
                        config = BenchmarkConfig(
                            backend=decode_backend,
                            batch_spec=spec,
                            num_layers=args.num_layers,
                            head_dim=args.head_dim,
                            num_q_heads=args.num_q_heads,
                            num_kv_heads=args.num_kv_heads,
                            block_size=args.block_size,
                            device=args.device,
                            repeats=args.repeats,
                            warmup_iters=args.warmup_iters,
                            profile_memory=args.profile_memory,
                            prefill_backend=pb,
                        )
                        result = run_benchmark(config)
                        # Label result with prefill backend name for display
                        labeled_config = replace(result.config, backend=pb)
                        result = replace(result, config=labeled_config)
                        prefill_results.append(result)
                        if not result.success:
                            console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
                        pbar.update(1)
            console.print("\n[bold green]Prefill Backend Results:[/]")
            formatter = ResultsFormatter(console)
            formatter.print_table(
                prefill_results, prefill_backends, compare_to_fastest=True
            )
        all_results = decode_results + prefill_results
    # Save results
    if all_results:
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
        max_kv_len = max(r.kv_len for r in requests) if requests else 0
        return (batch_size, max_q_len, max_kv_len)
    except Exception:
-        # Fallback for unparseable specs
+        # Fallback for unparsable specs
        return (0, 0, 0)
@@ -77,6 +77,7 @@ class MockKVBProj:
        self.qk_nope_head_dim = qk_nope_head_dim
        self.v_head_dim = v_head_dim
        self.out_dim = qk_nope_head_dim + v_head_dim
        self.weight = torch.empty(0, dtype=torch.bfloat16)
    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
        """
@@ -212,7 +213,11 @@ class BenchmarkConfig:
    profile_memory: bool = False
    use_cuda_graphs: bool = False
    # "auto" or "fp8"
    kv_cache_dtype: str = "auto"
    # MLA-specific
    prefill_backend: str | None = None
    kv_lora_rank: int | None = None
    qk_nope_head_dim: int | None = None
    qk_rope_head_dim: int | None = None
@@ -367,6 +372,7 @@ class ResultsFormatter:
                    "backend",
                    "batch_spec",
                    "num_layers",
                    "kv_cache_dtype",
                    "mean_time",
                    "std_time",
                    "throughput",
@@ -380,6 +386,7 @@ class ResultsFormatter:
                        "backend": r.config.backend,
                        "batch_spec": r.config.batch_spec,
                        "num_layers": r.config.num_layers,
                        "kv_cache_dtype": r.config.kv_cache_dtype,
                        "mean_time": r.mean_time,
                        "std_time": r.std_time,
                        "throughput": r.throughput_tokens_per_sec or 0,
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -30,9 +30,9 @@ batch_specs:
  - "2q16k_32q1s4k"         # 2 very large prefill + 32 decode
  # Context extension + decode
-  - "2q1kkv2k_16q1s1k"       # 2 extend + 16 decode
+  - "2q1ks2k_16q1s1k"       # 2 extend + 16 decode
-  - "4q2kkv4k_32q1s2k"       # 4 extend + 32 decode
+  - "4q2ks4k_32q1s2k"       # 4 extend + 32 decode
-  - "2q1kkv8k_32q1s2k"       # 2 large extend + 32 decode
+  - "2q1ks8k_32q1s2k"       # 2 large extend + 32 decode
  # Explicitly chunked prefill
  - "q8k"           # 8k prefill with chunking hint
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -1,4 +1,19 @@
-# MLA prefill-only benchmark configuration for sparse backends
+# MLA prefill backend comparison
 #
 # Compares all available MLA prefill backends:
 #   FA backends:  fa2, fa3, fa4 (FlashAttention versions)
 #   Non-FA:       flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
 #
 # Uses cutlass_mla as the decode backend for impl construction
 # (only the prefill path is exercised).
 #
 # Backends that aren't available on the current platform will report errors
 # in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
 #
 # Usage:
 #   python benchmark.py --config configs/mla_prefill.yaml
 description: "MLA prefill backend comparison"
 model:
  name: "deepseek-v3"
@@ -12,20 +27,25 @@ model:
  v_head_dim: 128
  block_size: 128
-# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# model:
-# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+#   name: "deepseek-v2-lite"
-model_parameter_sweep:
+#   num_layers: 27
-  param_name: "num_q_heads"
+#   num_q_heads: 16
-  values: [128, 64, 32, 16]
+#   num_kv_heads: 1
-  label_format: "{backend}_{value}h"
+#   head_dim: 576
 #   kv_lora_rank: 512
 #   qk_nope_head_dim: 128
 #   qk_rope_head_dim: 64
 #   v_head_dim: 128
 #   block_size: 128
 batch_specs:
  # Pure prefill
-  - "1q512"
+  - "q512"
-  - "1q1k"
+  - "q1k"
-  - "1q2k"
+  - "q2k"
-  - "1q4k"
+  - "q4k"
-  - "1q8k"
+  - "q8k"
  # Batched pure prefill
  - "2q512"
@@ -44,19 +64,63 @@ batch_specs:
  - "8q4k"
  - "8q8k"
-  # Extend
+  # Chunked prefill / extend
-  - "1q512s4k"
+  # Short context
-  - "1q512s8k"
+  - "q128s1k"
-  - "1q1ks8k"
+  - "q256s2k"
-  - "1q2ks8k"
+  - "q512s4k"
-  - "1q2ks16k"
+  - "q1ks4k"
-  - "1q4ks16k"
+  - "q2ks8k"
  - "2q128s1k"
  - "2q256s2k"
  - "2q512s4k"
  - "2q1ks4k"
  - "2q2ks8k"
  - "4q128s1k"
  - "4q256s2k"
  - "4q512s4k"
  - "4q1ks4k"
  - "4q2ks8k"
  - "8q128s1k"
  - "8q256s2k"
  - "8q512s4k"
  - "8q1ks4k"
-backends:
+  # Medium context
-  - FLASHMLA_SPARSE
+  - "q128s16k"
-  - FLASHINFER_MLA_SPARSE
+  - "q512s16k"
  - "q1ks16k"
  - "q2ks16k"
  - "2q128s16k"
  - "2q512s16k"
  - "2q1ks16k"
  - "2q2ks16k"
  - "4q128s16k"
  - "4q512s16k"
  - "4q1ks16k"
  - "4q2ks16k"
  # Long context
  - "q128s64k"
  - "q512s64k"
  - "q1ks64k"
  - "q2ks64k"
  - "2q128s64k"
  - "2q512s64k"
  - "2q1ks64k"
  - "2q2ks64k"
 decode_backends:
  - CUTLASS_MLA
 prefill_backends:
  - fa2
  - fa3
  - fa4
  - flashinfer
  - cudnn
  - trtllm
 device: "cuda:0"
-repeats: 10
+repeats: 20
-warmup_iters: 3
+warmup_iters: 5
 profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
@@ -0,0 +1,58 @@
 # MLA decode-only benchmark configuration
 model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value, can be swept for TP simulation
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
 # Model parameter sweep: simulate tensor parallelism by varying num_q_heads
 # TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
 model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"
 batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"     # 16 requests, 512 KV cache
  - "16q1s1k"      # 16 requests, 1k KV cache
  - "16q1s2k"      # 16 requests, 2k KV cache
  - "16q1s4k"      # 16 requests, 4k KV cache
  # Medium batches
  - "32q1s1k"      # 32 requests, 1k KV cache
  - "32q1s2k"      # 32 requests, 2k KV cache
  - "32q1s4k"      # 32 requests, 4k KV cache
  - "32q1s8k"      # 32 requests, 8k KV cache
  # Large batches
  - "64q1s1k"      # 64 requests, 1k KV cache
  - "64q1s2k"      # 64 requests, 2k KV cache
  - "64q1s4k"      # 64 requests, 4k KV cache
  - "64q1s8k"      # 64 requests, 8k KV cache
  # Very large batches
  - "128q1s1k"     # 128 requests, 1k KV cache
  - "128q1s2k"     # 128 requests, 2k KV cache
  - "128q1s4k"     # 128 requests, 4k KV cache
  - "128q1s8k"     # 128 requests, 8k KV cache
  # Long context
  - "32q1s16k"     # 32 requests, 16k KV cache
  - "32q1s32k"     # 32 requests, 32k KV cache
 backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE
 device: "cuda:0"
 repeats: 100
 warmup_iters: 10
 profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml
@@ -0,0 +1,62 @@
 # MLA prefill-only benchmark configuration for sparse backends
 model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128
 # Model parameter sweep: simulate tensor parallelism by varying num_q_heads
 # TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
 model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"
 batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"
  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"
  # Extend
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"
 backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE
 device: "cuda:0"
 repeats: 10
 warmup_iters: 3
 profile_memory: true
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -60,8 +60,11 @@ def create_minimal_vllm_config(
    model_name: str = "deepseek-v3",
    block_size: int = 128,
    max_num_seqs: int = 256,
    max_num_batched_tokens: int = 8192,
    mla_dims: dict | None = None,
    index_topk: int | None = None,
    prefill_backend: str | None = None,
    kv_cache_dtype: str = "auto",
 ) -> VllmConfig:
    """
    Create minimal VllmConfig for MLA benchmarks.
@@ -75,6 +78,9 @@ def create_minimal_vllm_config(
                  setup_mla_dims(model_name)
        index_topk: Optional topk value for sparse MLA backends. If provided,
                    the config will include index_topk for sparse attention.
        prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
                        "cudnn", "trtllm"). Configures the attention config to
                        force the specified prefill backend.
    Returns:
        VllmConfig for benchmarking
@@ -145,14 +151,13 @@ def create_minimal_vllm_config(
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
-        swap_space=0,
+        cache_dtype=kv_cache_dtype,
        cache_dtype="auto",
        enable_prefix_caching=False,
    )
    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=8192,
+        max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
        max_model_len=32768,
        is_encoder_decoder=False,
        enable_chunked_prefill=True,
@@ -164,7 +169,7 @@ def create_minimal_vllm_config(
    compilation_config = CompilationConfig()
-    return VllmConfig(
+    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
@@ -172,9 +177,84 @@ def create_minimal_vllm_config(
        compilation_config=compilation_config,
    )
    if prefill_backend is not None:
        prefill_cfg = get_prefill_backend_config(prefill_backend)
        if prefill_cfg["flash_attn_version"] is not None:
            vllm_config.attention_config.flash_attn_version = prefill_cfg[
                "flash_attn_version"
            ]
        vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
            "disable_flashinfer_prefill"
        ]
        vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
            "use_cudnn_prefill"
        ]
        vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
            "use_trtllm_ragged_deepseek_prefill"
        ]
    return vllm_config
 # ============================================================================
-# Backend Configuration
+# Prefill Backend Configuration
 # ============================================================================
 # Maps prefill backend names to attention config overrides.
 # FA backends set flash_attn_version and disable non-FA paths.
 # Non-FA backends enable their specific path and disable others.
 # Lookup table keyed by prefill backend name. Every entry carries the same
 # four keys; create_minimal_vllm_config copies them onto
 # vllm_config.attention_config, skipping flash_attn_version when it is None.
 _PREFILL_BACKEND_CONFIG: dict[str, dict] = {
    # FlashAttention backends: pin the FA version, disable flashinfer prefill
    # and leave the cuDNN / TRT-LLM flags off.
    "fa2": {
        "flash_attn_version": 2,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "fa3": {
        "flash_attn_version": 3,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "fa4": {
        "flash_attn_version": 4,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    # Non-FA backends: no FA version is pinned (None).
    # flashinfer is the only entry that leaves flashinfer prefill enabled.
    "flashinfer": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": False,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    # cudnn is the only entry that sets use_cudnn_prefill.
    "cudnn": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": True,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    # trtllm is the only entry that sets use_trtllm_ragged_deepseek_prefill.
    "trtllm": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": True,
    },
 }
 def get_prefill_backend_config(prefill_backend: str) -> dict:
    """Look up the attention config overrides for ``prefill_backend``.

    Args:
        prefill_backend: Key into ``_PREFILL_BACKEND_CONFIG``.

    Returns:
        The override dict registered under that name (the stored object
        itself, not a copy).

    Raises:
        ValueError: If the name is not a registered prefill backend; the
            message lists the registered names.
    """
    # Happy path first: known names return straight from the table.
    if prefill_backend in _PREFILL_BACKEND_CONFIG:
        return _PREFILL_BACKEND_CONFIG[prefill_backend]
    message = (
        f"Unknown prefill backend: {prefill_backend!r}. "
        f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
    )
    raise ValueError(message)
 # ============================================================================
 # Decode Backend Configuration
 # ============================================================================
@@ -204,6 +284,7 @@ def _get_backend_config(backend: str) -> dict:
    Returns:
        Dict with backend configuration
    """
    from vllm.v1.attention.backend import MultipleOf
    from vllm.v1.attention.backends.registry import AttentionBackendEnum
    try:
@@ -220,8 +301,8 @@ def _get_backend_config(backend: str) -> dict:
    block_sizes = backend_class.get_supported_kernel_block_sizes()
    # Use first supported block size (backends typically support one for MLA)
    block_size = block_sizes[0] if block_sizes else None
-    if hasattr(block_size, "value"):
+    if isinstance(block_size, MultipleOf):
-        # Handle MultipleOf enum
+        # No fixed block size; fall back to config value
        block_size = None
    # Check if sparse via class method if available
@@ -456,6 +537,7 @@ def _create_backend_impl(
    device: torch.device,
    max_num_tokens: int = 8192,
    index_topk: int | None = None,
    kv_cache_dtype: str = "auto",
 ):
    """
    Create backend implementation instance.
@@ -504,7 +586,7 @@ def _create_backend_impl(
        "num_kv_heads": mla_dims["num_kv_heads"],
        "alibi_slopes": None,
        "sliding_window": None,
-        "kv_cache_dtype": "auto",
+        "kv_cache_dtype": kv_cache_dtype,
        "logits_soft_cap": None,
        "attn_type": "decoder",
        "kv_sharing_target_layer_name": None,
@@ -622,6 +704,7 @@ def _run_single_benchmark(
    mla_dims: dict,
    device: torch.device,
    indexer=None,
    kv_cache_dtype: str | None = None,
 ) -> BenchmarkResult:
    """
    Run a single benchmark iteration.
@@ -655,53 +738,123 @@ def _run_single_benchmark(
    )
    # Create KV cache
-    kv_cache = torch.zeros(
+    if kv_cache_dtype is None:
-        num_blocks,
+        kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
-        block_size,
+    head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
-        mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
+    if kv_cache_dtype == "fp8_ds_mla":
-        device=device,
+        # FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
-        dtype=torch.bfloat16,
+        # Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
-    )
+        #         + 2*rope_dim bf16 bytes
        # = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
        kv_cache = torch.zeros(
            num_blocks,
            block_size,
            656,
            device=device,
            dtype=torch.uint8,
        )
    elif kv_cache_dtype == "fp8":
        from vllm.platforms import current_platform
-    # Create input tensors for both decode and prefill modes
+        kv_cache = torch.zeros(
-    decode_inputs, prefill_inputs = _create_input_tensors(
+            num_blocks,
-        total_q,
+            block_size,
-        mla_dims,
+            head_size,
-        backend_cfg["query_format"],
+            device=device,
-        device,
+            dtype=torch.uint8,
-        torch.bfloat16,
+        ).view(current_platform.fp8_dtype())
-    )
+    else:
        kv_cache = torch.zeros(
            num_blocks,
            block_size,
            head_size,
            device=device,
            dtype=torch.bfloat16,
        )
    # Fill indexer with random indices for sparse backends
    is_sparse = backend_cfg.get("is_sparse", False)
    if is_sparse and indexer is not None:
        indexer.fill_random_indices(total_q, max_kv_len)
-    # Determine which forward method to use
+    # Determine which forward methods to use based on metadata.
-    if is_sparse:
+    # Sparse MLA backends always use forward_mqa
-        # Sparse backends use forward_mqa
+    has_decode = is_sparse or getattr(metadata, "decode", None) is not None
-        forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
+    has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
-    elif metadata.decode is not None:
+    if not has_decode and not has_prefill:
        forward_fn = lambda: impl._forward_decode(
            decode_inputs, kv_cache, metadata, layer
        )
    elif metadata.prefill is not None:
        forward_fn = lambda: impl._forward_prefill(
            prefill_inputs["q"],
            prefill_inputs["k_c_normed"],
            prefill_inputs["k_pe"],
            kv_cache,
            metadata,
            prefill_inputs["k_scale"],
            prefill_inputs["output"],
        )
    else:
        raise RuntimeError("Metadata has neither decode nor prefill metadata")
    num_decode = (
        metadata.num_decode_tokens
        if (has_decode and has_prefill)
        else total_q
        if has_decode
        else 0
    )
    num_prefill = total_q - num_decode
    # Some backends require fp8 queries when using an fp8 KV cache.
    is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
    quantize_query = is_fp8_kvcache and getattr(
        impl, "supports_quant_query_input", False
    )
    # quantize_query forces concat format
    query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
    # Create decode query tensors
    if has_decode:
        decode_inputs, _ = _create_input_tensors(
            num_decode, mla_dims, query_fmt, device, torch.bfloat16
        )
        # Cast decode query to fp8 if the backend supports it
        if quantize_query:
            from vllm.platforms import current_platform
            if isinstance(decode_inputs, tuple):
                decode_inputs = torch.cat(list(decode_inputs), dim=-1)
            decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
    # Create prefill input tensors
    if has_prefill:
        _, prefill_inputs = _create_input_tensors(
            num_prefill, mla_dims, query_fmt, device, torch.bfloat16
        )
    # Build forward function
    def forward_fn():
        results = []
        if has_decode:
            results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
        if has_prefill:
            results.append(
                impl.forward_mha(
                    prefill_inputs["q"],
                    prefill_inputs["k_c_normed"],
                    prefill_inputs["k_pe"],
                    kv_cache,
                    metadata,
                    prefill_inputs["k_scale"],
                    prefill_inputs["output"],
                )
            )
        return results[0] if len(results) == 1 else tuple(results)
    # Warmup
    for _ in range(config.warmup_iters):
        forward_fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Optionally capture a CUDA graph after warmup.
    # Graph replay eliminates CPU launch overhead so timings reflect pure
    # kernel time.
    if config.use_cuda_graphs:
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            forward_fn()
        benchmark_fn = graph.replay
    else:
        benchmark_fn = forward_fn
    # Benchmark
    times = []
@@ -711,10 +864,10 @@ def _run_single_benchmark(
        start.record()
        for _ in range(config.num_layers):
-            forward_fn()
+            benchmark_fn()
        end.record()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        elapsed_ms = start.elapsed_time(end)
        times.append(elapsed_ms / 1000.0 / config.num_layers)
@@ -733,6 +886,7 @@ def _run_mla_benchmark_batched(
    backend: str,
    configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
    index_topk: int = 2048,
    prefill_backend: str | None = None,
 ) -> list[BenchmarkResult]:
    """
    Unified batched MLA benchmark runner for all backends.
@@ -744,11 +898,13 @@ def _run_mla_benchmark_batched(
    to avoid setup/teardown overhead.
    Args:
-        backend: Backend name
+        backend: Backend name (decode backend used for impl construction)
        configs_with_params: List of (config, threshold, num_splits) tuples
            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
            - num_splits: num_kv_splits (CUTLASS only)
        index_topk: Topk value for sparse MLA backends (default 2048)
        prefill_backend: Prefill backend name (e.g., "fa3", "flashinfer").
            When set, forces that backend (an FA version or a non-FA path).
    Returns:
        List of BenchmarkResult objects
@@ -758,7 +914,7 @@ def _run_mla_benchmark_batched(
    backend_cfg = _get_backend_config(backend)
    device = torch.device(configs_with_params[0][0].device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Determine block size
    config_block_size = configs_with_params[0][0].block_size
@@ -775,26 +931,91 @@ def _run_mla_benchmark_batched(
    # Determine if this is a sparse backend
    is_sparse = backend_cfg.get("is_sparse", False)
    # Extract kv_cache_dtype from the first config
    kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
    # FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
    # Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
    if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
        kv_cache_dtype = "fp8_ds_mla"
    # Compute max total_q across all configs so the metadata builder buffer
    # and scheduler config are large enough for all batch specs.
    max_total_q = max(
        sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
        for cfg, *_ in configs_with_params
    )
    # Create and set vLLM config for MLA (reused across all benchmarks)
    vllm_config = create_minimal_vllm_config(
        model_name="deepseek-v3",  # Used only for model path
        block_size=block_size,
        max_num_batched_tokens=max_total_q,
        mla_dims=mla_dims,  # Use custom dims from config or default
        index_topk=index_topk if is_sparse else None,
        prefill_backend=prefill_backend,
        kv_cache_dtype=kv_cache_dtype,
    )
    results = []
    with set_current_vllm_config(vllm_config):
        # Clear cached prefill backend detection functions so they re-evaluate
        # with the current VllmConfig. These are @functools.cache decorated and
        # would otherwise return stale results from a previous backend's config.
        from vllm.model_executor.layers.attention.mla_attention import (
            use_cudnn_prefill,
            use_flashinfer_prefill,
            use_trtllm_ragged_deepseek_prefill,
        )
        use_flashinfer_prefill.cache_clear()
        use_cudnn_prefill.cache_clear()
        use_trtllm_ragged_deepseek_prefill.cache_clear()
        # Create backend impl, layer, builder, and indexer (reused across benchmarks)
        impl, layer, builder_instance, indexer = _create_backend_impl(
            backend_cfg,
            mla_dims,
            vllm_config,
            device,
            max_num_tokens=max_total_q,
            index_topk=index_topk if is_sparse else None,
            kv_cache_dtype=kv_cache_dtype,
        )
        # Verify the actual prefill backend matches what was requested
        if prefill_backend is not None:
            prefill_cfg = get_prefill_backend_config(prefill_backend)
            fa_version = prefill_cfg["flash_attn_version"]
            if fa_version is not None:
                # FA backend: verify the impl's FA version
                actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
                if actual_fa_version != fa_version:
                    raise RuntimeError(
                        f"Prefill backend '{prefill_backend}' requested FA "
                        f"version {fa_version}, but the impl is using FA "
                        f"version {actual_fa_version}. Check "
                        f"vllm/v1/attention/backends/fa_utils.py."
                    )
            else:
                # Non-FA backend: verify the builder picked the right path
                expected_flags = {
                    "flashinfer": "_use_fi_prefill",
                    "cudnn": "_use_cudnn_prefill",
                    "trtllm": "_use_trtllm_ragged_prefill",
                }
                flag_name = expected_flags.get(prefill_backend)
                if flag_name and not getattr(builder_instance, flag_name, False):
                    raise RuntimeError(
                        f"Prefill backend '{prefill_backend}' was requested "
                        f"but the metadata builder did not enable it. This "
                        f"usually means a dependency is missing (e.g., "
                        f"flashinfer not installed) or the platform doesn't "
                        f"support it."
                    )
        # Run each benchmark with the shared impl
        for config, threshold, num_splits in configs_with_params:
            # Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -819,6 +1040,7 @@ def _run_mla_benchmark_batched(
                    mla_dims,
                    device,
                    indexer=indexer,
                    kv_cache_dtype=kv_cache_dtype,
                )
                results.append(result)
@@ -845,6 +1067,7 @@ def run_mla_benchmark(
    reorder_batch_threshold: int | None = None,
    num_kv_splits: int | None = None,
    index_topk: int = 2048,
    prefill_backend: str | None = None,
 ) -> BenchmarkResult | list[BenchmarkResult]:
    """
    Unified MLA benchmark runner for all backends.
@@ -862,6 +1085,8 @@ def run_mla_benchmark(
                                 (single config mode only)
        num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
        index_topk: Topk value for sparse MLA backends (default 2048)
        prefill_backend: Prefill backend name (e.g., "fa3", "flashinfer").
            When set, forces that backend (an FA version or a non-FA path).
    Returns:
        BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -885,7 +1110,9 @@ def run_mla_benchmark(
        return_single = True
    # Use unified batched execution
-    results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
+    results = _run_mla_benchmark_batched(
        backend, configs_with_params, index_topk, prefill_backend=prefill_backend
    )
    # Return single result or list based on input
    return results[0] if return_single else results
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -140,8 +140,7 @@ def _create_vllm_config(
    cache_config = CacheConfig(
        block_size=config.block_size,
-        cache_dtype="auto",
+        cache_dtype=config.kv_cache_dtype,
        swap_space=0,
    )
    cache_config.num_gpu_blocks = max_num_blocks
    cache_config.num_cpu_blocks = 0
@@ -216,7 +215,7 @@ def _create_backend_impl(
        num_kv_heads=config.num_kv_heads,
        alibi_slopes=None,
        sliding_window=None,
-        kv_cache_dtype="auto",
+        kv_cache_dtype=config.kv_cache_dtype,
    )
    kv_cache_spec = FullAttentionSpec(
@@ -289,12 +288,22 @@ def _create_input_tensors(
    total_q: int,
    device: torch.device,
    dtype: torch.dtype,
    quantize_query: bool = False,
 ) -> tuple:
-    """Create Q, K, V input tensors for all layers."""
+    """Create Q, K, V input tensors for all layers.
    When quantize_query is True, queries are cast to fp8 to match backends
    that require query/key/value dtype consistency.
    """
    q_dtype = dtype
    if quantize_query:
        from vllm.platforms import current_platform
        q_dtype = current_platform.fp8_dtype()
    q_list = [
        torch.randn(
            total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
-        )
+        ).to(q_dtype)
        for _ in range(config.num_layers)
    ]
    k_list = [
@@ -345,10 +354,17 @@ def _create_kv_cache(
    # Compute inverse permutation to get back to logical view
    inv_order = [stride_order.index(i) for i in range(len(stride_order))]
    # Use fp8 dtype for cache when requested.
    cache_dtype = dtype
    if config.kv_cache_dtype == "fp8":
        from vllm.platforms import current_platform
        cache_dtype = current_platform.fp8_dtype()
    cache_list = []
    for _ in range(config.num_layers):
        # Allocate in physical layout order (contiguous in memory)
-        cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
+        cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
        # Permute to logical view
        cache = cache.permute(*inv_order)
        cache_list.append(cache)
@@ -391,7 +407,38 @@ def _run_single_benchmark(
                attn_metadata,
                output=out,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Optionally capture a CUDA graph after warmup.
    # Graph replay eliminates CPU launch overhead so timings reflect pure
    # kernel time.
    if config.use_cuda_graphs:
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            for i in range(config.num_layers):
                impl.forward(
                    layer,
                    q_list[i],
                    k_list[i],
                    v_list[i],
                    cache_list[i],
                    attn_metadata,
                    output=out,
                )
        benchmark_fn = graph.replay
    else:
        def benchmark_fn():
            for i in range(config.num_layers):
                impl.forward(
                    layer,
                    q_list[i],
                    k_list[i],
                    v_list[i],
                    cache_list[i],
                    attn_metadata,
                    output=out,
                )
    # Benchmark
    times = []
@@ -400,27 +447,18 @@ def _run_single_benchmark(
        end = torch.cuda.Event(enable_timing=True)
        start.record()
-        for i in range(config.num_layers):
+        benchmark_fn()
            impl.forward(
                layer,
                q_list[i],
                k_list[i],
                v_list[i],
                cache_list[i],
                attn_metadata,
                output=out,
            )
        end.record()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        elapsed_ms = start.elapsed_time(end)
        times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer
    mem_stats = {}
    if config.profile_memory:
        mem_stats = {
-            "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
+            "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
-            "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+            "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
        }
    return times, mem_stats
@@ -444,7 +482,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
        BenchmarkResult with timing and memory statistics
    """
    device = torch.device(config.device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    backend_cfg = _get_backend_config(config.backend)
@@ -503,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                common_attn_metadata=common_metadata,
            )
            # Only quantize queries when the impl supports it
            quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
                impl, "supports_quant_query_input", False
            )
            q_list, k_list, v_list = _create_input_tensors(
-                config, total_q, device, dtype
+                config, total_q, device, dtype, quantize_query=quantize_query
            )
            cache_list = _create_kv_cache(
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
 | `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
-| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
+| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
 | `TP` | **Required.** The tensor-parallelism size. | `1` |
 | `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
 | `INPUT_LEN` | **Required.** Request input length. | `4000` |
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -94,15 +94,18 @@ def create_logits(
 def measure_memory() -> tuple[int, int]:
    """Return (allocated, reserved) memory in bytes."""
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
-    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
+    return (
        torch.accelerator.memory_allocated(),
        torch.accelerator.max_memory_allocated(),
    )
 def reset_memory_stats():
    """Reset peak memory statistics."""
    reset_buffer_cache()
-    torch.cuda.reset_peak_memory_stats()
+    torch.accelerator.reset_peak_memory_stats()
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
    gc.collect()
@@ -123,7 +126,7 @@ def benchmark_function(
    for _ in range(warmup_iters):
        logits_copy = logits.clone()
        func(logits_copy, k, p)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Reset memory stats before benchmark
    reset_memory_stats()
@@ -140,7 +143,7 @@ def benchmark_function(
        func(logits_copy, k, p)
        end_events[i].record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Calculate timing
    times = [
--- a/benchmarks/kernels/bench_concat_mla_q.py
+++ b/benchmarks/kernels/bench_concat_mla_q.py
@@ -0,0 +1,98 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import torch
 from vllm import _custom_ops as ops
 from vllm.triton_utils import triton
 # DeepSeek V3 dimensions
 NOPE_DIM = 512
 ROPE_DIM = 64
 NUM_HEADS = 128
 NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
 def get_configs():
    """Return the num_tokens values to sweep (the module-level NUM_TOKENS list)."""
    return NUM_TOKENS
 def make_inputs(num_tokens, dtype):
    """Create inputs matching the real code path.
    ql_nope is allocated as [NUM_HEADS, num_tokens, NOPE_DIM] and returned
    as a transposed view of shape [num_tokens, NUM_HEADS, NOPE_DIM], which
    simulates the stride pattern of a transposed BMM output. q_pe is
    allocated directly as [num_tokens, NUM_HEADS, ROPE_DIM].
    Args:
        num_tokens: Number of tokens in both returned tensors.
        dtype: Element dtype of both returned tensors.
    Returns:
        Tuple (ql_nope, q_pe), both created on the "cuda" device.
    """
    # Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
    raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
    ql_nope = raw.transpose(0, 1)
    q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
    return ql_nope, q_pe
 # ---- Non-contiguous nope benchmark (real code path) ----
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens"],
        x_vals=get_configs(),
        line_arg="provider",
        line_vals=["torch_cat", "concat_mla_q"],
        line_names=["torch.cat", "concat_mla_q (v8)"],
        styles=[("blue", "--"), ("green", "-")],
        ylabel="Latency (us)",
        plot_name="concat_mla_q-transposed",
        args={},
    )
 )
 def bench_transposed(num_tokens, provider):
    dtype = torch.bfloat16
    ql_nope, q_pe = make_inputs(num_tokens, dtype)
    q_out = torch.empty(
        num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
    )
    quantiles = [0.5, 0.2, 0.8]
    if provider == "torch_cat":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
        )
    else:
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
        )
    return ms * 1000, max_ms * 1000, min_ms * 1000  # us
 if __name__ == "__main__":
    # CLI: the only option is where perf_report should save its results.
    cli = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
    cli.add_argument(
        "--save-path", type=str, default=None, help="Path to save benchmark results"
    )
    args = cli.parse_args()
    rule = "=" * 70
    out_dim = NOPE_DIM + ROPE_DIM
    # Header describing the problem dimensions being benchmarked.
    print("\n" + rule)
    print("CONCAT MLA Q KERNEL BENCHMARKS")
    print(rule)
    print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
    print(f"Per-head output: {out_dim} bf16 = {out_dim * 2} bytes")
    print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
    print(rule)
    print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
    bench_transposed.run(print_data=True, save_path=args.save_path)
    # Footer.
    print("\n" + rule)
    print("Benchmarking complete!")
    print(rule)
--- a/benchmarks/kernels/bench_cp_gather_fp8.py
+++ b/benchmarks/kernels/bench_cp_gather_fp8.py
@@ -0,0 +1,153 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import math
 import torch
 from vllm import _custom_ops as ops
 from vllm.triton_utils import triton
 # DeepSeek V3 MLA dimensions
 NOPE_DIM = 512
 ROPE_DIM = 64
 HEAD_DIM = NOPE_DIM + ROPE_DIM  # 576 BF16 output elements per token
 ENTRY_BYTES = 656  # 512 FP8 + 16 scales + 128 BF16 RoPE
 BLOCK_SIZE = 64  # tokens per physical cache block - get_supported_kernel_block_sizes
 # Realistic prefill scenarios:
 #   - 1 long prefill: single request, 16K-96K tokens
 #   - 4 medium prefills: 4 requests, 4K-24K tokens each
 #   - 16 shorter prefills: 16 requests, 1K-6K tokens each
 SCENARIOS = [
    # (label, num_reqs, total_tokens_list)
    ("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
    ("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
    ("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
 ]
 def make_inputs(total_tokens, num_reqs, block_size):
    """Create synthetic FP8 cache, block table, and output buffer.
    Fills the cache with random bytes (we only measure throughput,
    not correctness). Block table maps each request to contiguous
    physical blocks; unused trailing entries of a row are 0.
    Args:
        total_tokens: Total number of tokens across all requests.
        num_reqs: Number of requests the tokens are divided across.
        block_size: Number of tokens per physical cache block.
    Returns:
        Tuple (cache, dst, block_table, seq_lens_t, workspace_starts_t),
        all created on the "cuda" device.
    """
    # Divide tokens evenly across requests; the first `remainder` requests
    # each take one extra token.
    base_len, remainder = divmod(total_tokens, num_reqs)
    seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
    # workspace_starts: exclusive running sum of seq_lens
    workspace_starts = []
    offset = 0
    for seq_len in seq_lens:
        workspace_starts.append(offset)
        offset += seq_len
    # Physical blocks needed per request (integer ceiling division)
    blocks_per_req = [(s + block_size - 1) // block_size for s in seq_lens]
    total_blocks = sum(blocks_per_req)
    max_blocks = max(blocks_per_req)
    # Allocate cache with random data (content doesn't matter for perf)
    cache = torch.randint(
        0,
        256,
        (total_blocks, block_size, ENTRY_BYTES),
        dtype=torch.uint8,
        device="cuda",
    )
    # Block table: contiguous block assignments. Build the rows on the host
    # and transfer them once, instead of one device write per block.
    table_rows = []
    next_block = 0
    for n_blocks in blocks_per_req:
        row = list(range(next_block, next_block + n_blocks))
        row.extend([0] * (max_blocks - n_blocks))
        table_rows.append(row)
        next_block += n_blocks
    block_table = torch.tensor(table_rows, dtype=torch.int32, device="cuda")
    # Output workspace
    dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
    seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
    workspace_starts_t = torch.tensor(
        workspace_starts, dtype=torch.int32, device="cuda"
    )
    return cache, dst, block_table, seq_lens_t, workspace_starts_t
 def bench_scenario(label, num_reqs, total_tokens_list, save_path):
    """Run benchmark for a specific (num_reqs, total_tokens) scenario.
    Args:
        label: Scenario name, used in the plot name and the printed header.
        num_reqs: Number of requests the tokens are divided across.
        total_tokens_list: Total-token values to sweep (the x axis).
        save_path: Passed through to perf_report's run() as save_path.
    """
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["total_tokens"],
            x_vals=total_tokens_list,
            line_arg="provider",
            line_vals=["cuda_kernel"],
            line_names=["cp_gather_fp8 (CUDA)"],
            styles=[("green", "-")],
            ylabel="Latency (us)",
            plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
            args={"num_reqs": num_reqs},
        )
    )
    def bench_fn(total_tokens, provider, num_reqs):
        cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
            total_tokens, num_reqs, BLOCK_SIZE
        )
        # Median first, then the 20th and 80th percentiles.
        quantiles = [0.5, 0.2, 0.8]
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
                cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
            ),
            quantiles=quantiles,
            rep=500,
        )
        # perf_report reads the returned triple as (value, min, max). For a
        # latency metric the lower percentile is the min, so it goes second.
        return ms * 1000, min_ms * 1000, max_ms * 1000  # us
    # Approximate per-request lengths at the smallest and largest sweep point.
    seq_len_per_req = total_tokens_list[0] // num_reqs
    seq_len_per_req_max = total_tokens_list[-1] // num_reqs
    print(
        f"\n--- {label}: {num_reqs} request(s), "
        f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
    )
    bench_fn.run(print_data=True, save_path=save_path)
 if __name__ == "__main__":
    # CLI: the only option is where the benchmark results should be saved.
    cli = argparse.ArgumentParser(
        description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
    )
    cli.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save benchmark results as CSV",
    )
    args = cli.parse_args()
    # Data volume per token, printed for bandwidth analysis.
    bytes_read = ENTRY_BYTES  # bytes read from the cache per token
    bytes_written = HEAD_DIM * 2  # HEAD_DIM BF16 values written to the workspace
    bytes_moved = bytes_read + bytes_written
    rule = "=" * 70
    print("\n" + rule)
    print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
    print(rule)
    print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
    print(f"Output row:  {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
    print(f"Per token:   {bytes_moved} bytes (read + write)")
    print(f"Block size:  {BLOCK_SIZE} tokens/block")
    print(rule)
    # Each scenario is a (label, num_reqs, total_tokens_list) triple.
    for scenario in SCENARIOS:
        bench_scenario(*scenario, args.save_path)
    print("\n" + rule)
    print("Benchmarking complete!")
    print(rule)
--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
@@ -168,7 +168,7 @@ def bench_impl(
    # warmup
    for kwargs in kwargs_list:
        impl_type.get_impl()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
    # reference output
    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
-    # test ouptut
+    # test output
    out_q, out_s = output_from_impl(
        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
    )
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -64,7 +64,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    (m, k, n) = mkn
    dtype = torch.half
@@ -171,7 +171,7 @@ def bench_run(
                activation=MoEActivation.SILU,
                global_num_experts=num_experts,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
@@ -187,14 +187,14 @@ def bench_run(
                topk_ids,
                quant_config=quant_config,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        # Timing
        start_event = torch.Event(enable_timing=True)
@@ -202,7 +202,7 @@ def bench_run(
        latencies = []
        for _ in range(num_iters):
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -307,7 +307,7 @@ def bench_run(
    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
@@ -330,7 +330,7 @@ def bench_run(
            e=num_experts,
            device=device,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
@@ -345,7 +345,7 @@ def bench_run(
            w2_fp8scale,
            a_fp8_scale,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    min_run_time = 5
    num_warmup = 5
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
            if not should_use_fn(tensor):
                return None
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            stream = torch.cuda.Stream()
            with torch.cuda.stream(stream):
                graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            for _ in range(num_warmup):
                graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            start_time = time.perf_counter()
            for _ in range(num_trials):
                graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            end_time = time.perf_counter()
@@ -495,7 +495,7 @@ def main():
    # Set device
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Get CPU process group
    cpu_group = dist.new_group(backend="gloo")
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -385,32 +385,32 @@ def benchmark_operation(
    # Warmup before graph capture
    for _ in range(warmup):
        operation_func(*args, **kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Create CUDA graph
    graph = torch.cuda.CUDAGraph()
    num_op_per_cudagraph = 10
    # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
-    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
    with graph_capture(device=device), torch.cuda.graph(graph):
        for _ in range(num_op_per_cudagraph):
            operation_func(*args, **kwargs)
    # Graph warmup
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    for _ in range(warmup):
        graph.replay()
    # Benchmark with CUDA graph
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_time = time.perf_counter()
    for _ in range(trials // num_op_per_cudagraph):
        # operation_func(*args, **kwargs)
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    end_time = time.perf_counter()
    avg_time_ms = ((end_time - start_time) / trials) * 1000
@@ -984,7 +984,7 @@ def main():
    world_size = int(os.environ["WORLD_SIZE"])
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    init_distributed_environment()
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -50,7 +50,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    label = "Quant Matmul"
    sub_label = (
@@ -224,7 +224,7 @@ def bench_run(
    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
@@ -239,7 +239,7 @@ def bench_run(
            topk_weights,
            topk_ids,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
@@ -254,7 +254,7 @@ def bench_run(
            w2_scale,
            a_scale,
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    min_run_time = 5
    num_warmup = 5
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -34,14 +34,14 @@ def main(
    residual = torch.randn_like(x) * scale if add_residual else None
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
        for _ in range(num_iters):
            layer(x, residual)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1035,7 +1035,7 @@ def bench_optype(
    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -47,13 +47,13 @@ def benchmark_method(
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    end = time.perf_counter()
    return (end - start) / num_iters * 1000  # Convert to ms
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -54,7 +54,7 @@ def clear_triton_cache():
    # Clear CUDA memory cache
    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()
    # Try to clear Triton's runtime cache
    try:
@@ -304,19 +304,19 @@ def benchmark_config(
    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -324,7 +324,7 @@ def benchmark_config(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
@@ -626,7 +626,11 @@ class BenchmarkWorker:
            if visible_device != f"{self.device_id}":
                need_device_guard = True
-        with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+        with (
            torch.accelerator.device_index(self.device_id)
            if need_device_guard
            else nullcontext()
        ):
            for idx, config in enumerate(tqdm(search_space)):
                try:
                    kernel_time = benchmark_config(
--- a/benchmarks/kernels/benchmark_moe_defaults.py
+++ b/benchmarks/kernels/benchmark_moe_defaults.py
@@ -131,7 +131,7 @@ def benchmark_config(
                topk_ids,
                quant_config=quant_config,
            )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Benchmark
    start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
                quant_config=quant_config,
            )
    end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    return start.elapsed_time(end) / num_iters * 1000  # ms -> us
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -69,19 +69,19 @@ def benchmark_permute(
    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
    # JIT compilation & warmup
    input = prepare()
    run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
    latencies: list[float] = []
    for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
        end_event.record()
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -135,14 +135,14 @@ def benchmark_mrope(
            key.clone(),
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Time reference implementation
    torch_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()
        mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
            key_clone,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        torch_times.append(time.time() - start_time)
    # Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()
        mrope_helper_class.forward_cuda(
            positions,
            query_clone,
            key_clone,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        triton_times.append(time.time() - start_time)
    # Calculate statistics
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -103,7 +103,7 @@ def main(
        max_logits = torch.empty_like(exp_sums)
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
                    )
            else:
                raise ValueError(f"Invalid version: {version}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -28,7 +28,7 @@ def _time_cuda(
    # warmup
    for _ in range(warmup_iters):
        fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
    for _ in range(bench_iters):
        fn()
    end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    return start.elapsed_time(end) / bench_iters  # ms/iter
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -29,7 +29,7 @@ def main(
    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
                ops.scaled_int8_quant(x, scale)
            else:
                ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -84,16 +84,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters
@@ -104,7 +104,7 @@ def run_benchmark(
    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
    return lat
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -109,16 +109,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters
@@ -129,7 +129,7 @@ def run_benchmark(
    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
    return lat
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -251,7 +251,7 @@ def benchmark(
        kernel(
            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
    # Benchmark
    latencies: list[float] = []
    for _ in range(runs):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        for i in range(iterations_per_run):
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -126,7 +126,7 @@ def benchmark_decode(
    )
    def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = torch.Event(enable_timing=True)
        end = torch.Event(enable_timing=True)
        times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
            start.record()
            fn()
            end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -138,7 +138,7 @@ def benchmark_prefill(
    )
    def time_fn(fn, warmup=10, trials=20):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = torch.Event(enable_timing=True)
        end = torch.Event(enable_timing=True)
        times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
            start.record()
            fn()
            end.record()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -177,18 +177,18 @@ def benchmark_config(
    def run():
        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # JIT compilation & warmup
    for _ in range(5):
        run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
    latencies: list[float] = []
    for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        run()
        end_event.record()
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]
-    torch.cuda.set_device(gpu_id)
+    torch.accelerator.set_device_index(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
    block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
 def main(args):
    print(args)
-    num_gpus = torch.cuda.device_count()
+    num_gpus = torch.accelerator.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -35,7 +35,7 @@ def benchmark_shape(
    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
    # Reference result in BF16
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    C_ref = A @ B.t()
    # Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
        # Warmup
        for _ in range(warmup):
            func()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        # Timing loop
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.time()
        for _ in range(repeat):
            func()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end = time.time()
        # Calculate timing and TFLOPS
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -79,7 +79,8 @@ else()
    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
-    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
+    find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
    find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
    # Support cross-compilation by allowing override via environment variables
    if (ENABLE_ARM_BF16)
@@ -101,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
        "-mavx512f"
        "-mavx512vl"
        "-mavx512bw"
-        "-mavx512dq"
+        "-mavx512dq")
-        "-mavx512bf16"
+    list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX 
-        "-mavx512vnni"
+        ${CXX_COMPILE_FLAGS_AVX512}
        "-mamx-bf16"
-        "-mamx-tile")
+        "-mamx-tile"
        "-mavx512bf16"
        "-mavx512vnni")
    list(APPEND CXX_COMPILE_FLAGS_AVX2
        "-mavx2")
 elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
@@ -142,11 +145,19 @@ elseif (S390_FOUND)
        "-march=native"
        "-mtune=native")
 elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
-    if(RVV_FOUND)
+    message(STATUS "RISC-V detected")
-	    message(FAIL_ERROR "Can't support rvv now.")
+    if(RVV_BF16_FOUND)
        message(STATUS "BF16 extension detected")
        set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
        add_compile_definitions(RISCV_BF16_SUPPORT)
    elseif (RVV_FP16_FOUND)
        message(WARNING "BF16 functionality is not available")
        set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
    else()
        message(STATUS "compile riscv with scalar")
        list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
    endif()
    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
 else()
    message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()
@@ -242,13 +253,24 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
        )
    else()
        message(STATUS "Downloading oneDNN from GitHub")
-        FetchContent_Declare(
+        if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
-            oneDNN
+            message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
-            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            FetchContent_Declare(
-            GIT_TAG v3.10
+                oneDNN
-            GIT_PROGRESS TRUE
+                GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-            GIT_SHALLOW TRUE
+                GIT_TAG        9c5be1cc59e368aebf0909e6cf20f981ea61462a
-        )
+                GIT_PROGRESS   TRUE
                GIT_SHALLOW    FALSE
            )
        else()
            FetchContent_Declare(
                oneDNN
                GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
                GIT_TAG        v3.10
                GIT_PROGRESS   TRUE
                GIT_SHALLOW    TRUE
            )
        endif()
    endif()
    set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -294,7 +316,8 @@ endif()
 # TODO: Refactor this
 if (ENABLE_X86_ISA)
-    message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
+    message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
    message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
    message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
 else()
    message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -346,13 +369,15 @@ if(USE_ONEDNN)
 endif()
 if (ENABLE_X86_ISA)
-    set(VLLM_EXT_SRC_AVX512
+    set(VLLM_EXT_SRC_SGL
        "csrc/cpu/sgl-kernels/gemm.cpp"
        "csrc/cpu/sgl-kernels/gemm_int8.cpp"
        "csrc/cpu/sgl-kernels/gemm_fp8.cpp"
        "csrc/cpu/sgl-kernels/moe.cpp"
        "csrc/cpu/sgl-kernels/moe_int8.cpp"
-        "csrc/cpu/sgl-kernels/moe_fp8.cpp"
+        "csrc/cpu/sgl-kernels/moe_fp8.cpp")
    set(VLLM_EXT_SRC_AVX512
        "csrc/cpu/shm.cpp"
        "csrc/cpu/cpu_wna16.cpp"
        "csrc/cpu/cpu_fused_moe.cpp"
@@ -378,31 +403,48 @@ if (ENABLE_X86_ISA)
        "csrc/cpu/pos_encoding.cpp"
        "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") 
-    message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
+    message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
    message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
    message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
    set(_C_LIBS numa dnnl_ext)
    set(_C_AVX512_LIBS numa dnnl_ext)
    set(_C_AVX2_LIBS numa)
    # AMX + AVX512F + AVX512BF16 + AVX512VNNI
    define_extension_target(
        _C
        DESTINATION vllm
        LANGUAGE CXX
        SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
        LIBRARIES ${_C_LIBS}
        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
        USE_SABI 3
        WITH_SOABI
    )
    # For AMX kernels
    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
    # AVX512F 
    define_extension_target(
        _C_AVX512
        DESTINATION vllm
        LANGUAGE CXX
        SOURCES ${VLLM_EXT_SRC_AVX512}
-        LIBRARIES ${LIBS}
+        LIBRARIES ${_C_AVX512_LIBS}
        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
        USE_SABI 3
        WITH_SOABI
    )
-    # For SGL kernels
+    # AVX2 
    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
    # For AMX kernels
    target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
    define_extension_target(
        _C_AVX2
        DESTINATION vllm
        LANGUAGE CXX
        SOURCES ${VLLM_EXT_SRC_AVX2}
-        LIBRARIES ${LIBS}
+        LIBRARIES ${_C_AVX2_LIBS}
        COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
        USE_SABI 3
        WITH_SOABI
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -39,7 +39,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
+          GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
    int64_t quant_block_size,     // quantization block size
    const std::string& scale_fmt);
 // Concatenate query nope and rope for MLA/DSA attention
 void concat_mla_q(
    torch::Tensor& ql_nope,  // [num_tokens, num_heads, nope_dim]
    torch::Tensor& q_pe,     // [num_tokens, num_heads, rope_dim]
    torch::Tensor& q_out);   // [num_tokens, num_heads, nope_dim + rope_dim]
 // Extract function to gather quantized K cache
 void cp_gather_indexer_k_quant_cache(
    const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride]
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -8,6 +8,7 @@
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
 #include "quantization/vectorization_utils.cuh"
 #include "concat_mla_q.cuh"
 #ifdef USE_ROCM
  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
@@ -918,8 +919,8 @@ __global__ void gather_and_maybe_dequant_cache(
 // SCALAR_T is the data type of the destination tensor.
 // CACHE_T is the stored data type of kv-cache.
 // KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                        \
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ)              \
-  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576,      \
+  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
                                       thread_block_size>                     \
      <<<grid, block, 0, stream>>>(                                           \
          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                   \
@@ -930,6 +931,12 @@ __global__ void gather_and_maybe_dequant_cache(
          dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
          seq_starts_ptr);
 #define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
  CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
 #define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
  CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
@@ -959,9 +966,10 @@ void gather_and_maybe_dequant_cache(
    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                "seq_starts must be int32");
  }
-  TORCH_CHECK(head_dim == 576,
+  TORCH_CHECK(
-              "gather_and_maybe_dequant_cache only support the head_dim to 576 "
+      head_dim == 320 || head_dim == 576,
-              "for better performance")
+      "gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
      "for better performance")
  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
@@ -986,7 +994,13 @@ void gather_and_maybe_dequant_cache(
  const int32_t* seq_starts_ptr =
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
-  DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
+  if (head_dim == 576) {
    DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
                               CALL_GATHER_CACHE_576);
  } else {
    DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
                               CALL_GATHER_CACHE_320);
  }
 }
 namespace vllm {
@@ -995,75 +1009,67 @@ namespace vllm {
 // Similar to cp_gather_cache but specifically for FP8->BF16 conversion
 __global__ void cp_gather_and_upconvert_fp8_kv_cache(
    const uint8_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, 656]
-    __nv_bfloat16* __restrict__ dst,          // [TOT_TOKENS, 576]
+    __nv_bfloat16* __restrict__ dst,          // [total_tokens, 576]
-    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
+    const int32_t* __restrict__ block_table,  // [num_reqs, BLOCK_INDICES]
-    const int32_t* __restrict__ seq_lens,     // [BATCH]
+    const int32_t* __restrict__ workspace_starts,  // [num_reqs]
-    const int32_t* __restrict__ workspace_starts,  // [BATCH]
+    const int32_t num_reqs, const int32_t block_size,
-    const int32_t block_size, const int32_t head_dim,
+    const int32_t total_tokens, const int64_t block_table_stride,
-    const int64_t block_table_stride, const int64_t cache_block_stride,
+    const int64_t cache_block_stride, const int64_t cache_entry_stride,
-    const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
+    const int64_t dst_entry_stride) {
-  const int64_t bid = blockIdx.x;  // Batch ID
+  const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
-  const int32_t num_splits = gridDim.y;
+  if (flat_warp_id >= total_tokens) return;
-  const int32_t split = blockIdx.y;
+  const int lane_id = threadIdx.x & 31;
  const int32_t seq_start = workspace_starts[bid];
  const int32_t seq_len = seq_lens[bid];
  const int32_t tot_slots = seq_len;
  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
-  const int32_t split_start = split * split_slots;
+  // Binary search to find which request owns this output token
-  const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+  int lo = 0, hi = num_reqs - 1;
-
+  while (lo < hi) {
-  const bool is_active_split = (split_start < tot_slots);
+    int mid = (lo + hi + 1) >> 1;
-
+    if (workspace_starts[mid] <= flat_warp_id)
-  if (!is_active_split) return;
+      lo = mid;
-
+    else
-  // Adjust the pointer for the block_table for this batch
+      hi = mid - 1;
  const int32_t batch_offset = bid * block_table_stride;
  int32_t offset = split_start;
  int32_t offset_div = offset / block_size;
  offset = offset % block_size;
  const int32_t* batch_block_table = block_table + batch_offset;
  // Adjust dst pointer based on the cumulative sequence lengths
  dst += seq_start * dst_entry_stride;
  const int tid = threadIdx.x;
  // Process each token in this split
  for (int pid = split_start; pid < split_end; ++pid) {
    auto block_id = batch_block_table[offset_div];
    const uint8_t* token_ptr =
        src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
    __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
    // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
    const uint8_t* no_pe_ptr = token_ptr;
    const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
    const __nv_bfloat16* rope_ptr =
        reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
    // Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
    if (tid < 512) {
      // FP8 dequantization
      const int tile = tid >> 7;  // each tile is 128 elements
      const float scale = scales_ptr[tile];
      const uint8_t val = no_pe_ptr[tid];
      dst_ptr[tid] =
          fp8::scaled_convert<__nv_bfloat16, uint8_t,
                              vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
    } else if (tid < 576) {
      // Rope copy (64 bf16 elements)
      const int rope_idx = tid - 512;
      dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
    }
    // Move to next token
    offset += 1;
    if (offset == block_size) {
      offset_div += 1;
      offset = 0;
    }
  }
  const int req_id = lo;
  // Compute physical token address via block table
  const int out_token_id = flat_warp_id;
  const int token_offset = out_token_id - workspace_starts[req_id];
  const int cache_block_idx = token_offset / block_size;
  const int offset_in_block = token_offset % block_size;
  const int physical_block =
      block_table[req_id * block_table_stride + cache_block_idx];
  const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
                             offset_in_block * cache_entry_stride;
  const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
  const int4 fp8_data = nope_src[lane_id];
  const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
  const float scale = scales_ptr[lane_id >> 3];
  const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
  const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
 #ifdef USE_ROCM
  const bf16_8_t bf16_lo =
      fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
  const bf16_8_t bf16_hi =
      fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
 #else
  const bf16_8_t bf16_lo =
      fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
  const bf16_8_t bf16_hi =
      fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
 #endif
  __nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
  int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
  nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
  nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
  const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
  int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
  rope_dst[lane_id] = rope_src[lane_id];
 }
 template <typename scalar_t>
@@ -1257,15 +1263,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
    src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
  }
-  // Decide on the number of splits based on the batch size
+  const int total_tokens = dst.size(0);
-  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+  constexpr int warps_per_block = 8;
-  dim3 grid(batch_size, num_splits);
+  const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
-  dim3 block(576);
+  const int block_size_threads = warps_per_block * 32;  // 256 threads
-  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
+  vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
                                               stream>>>(
      src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
-      block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
+      block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
-      workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
+      static_cast<int32_t>(batch_size), block_size, total_tokens,
      block_table_stride, cache_block_stride, cache_entry_stride,
      dst_entry_stride);
 }
@@ -1365,3 +1372,43 @@ void cp_gather_indexer_k_quant_cache(
    CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
  }
 }
 // Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
 // Replaces torch.cat((ql_nope, q_pe), dim=-1).
 //
 // Launches vllm::ConcatMLAQKernel with one 32-thread warp per (token, head)
 // row, 8 warps per thread block, on the current CUDA stream of ql_nope's
 // device.
 //
 // Fails (TORCH_CHECK) when:
 //   - nope_dim != 512: the kernel is instantiated only for NOPE_DIM == 512, so
 //     a larger multiple of 512 would copy just the first 512 elements and
 //     place the rope part at the wrong offset;
 //   - rope_dim != 64;
 //   - q_out's last dimension is not nope_dim + rope_dim;
 //   - any of the three tensors is not unit-stride in its last dimension.
 //
 // NOTE(review): the kernel uses wide vector loads/stores; row base addresses
 // are presumably required to be aligned to the vector width. That is not
 // verified here -- confirm against callers.
 void concat_mla_q(torch::Tensor& ql_nope,  // [num_tokens, num_heads, nope_dim]
                   torch::Tensor& q_pe,     // [num_tokens, num_heads, rope_dim]
                   torch::Tensor& q_out     // [num_tokens, num_heads, nope_dim +
                                            // rope_dim]
 ) {
  // Must stay in sync with the NOPE_DIM template argument used at the launch
  // site below.
  constexpr int kNopeDim = 512;
  const int num_tokens = ql_nope.size(0);
  const int num_heads = ql_nope.size(1);
  const int nope_dim = ql_nope.size(2);
  const int rope_dim = q_pe.size(2);
  TORCH_CHECK(nope_dim == kNopeDim, "nope_dim must be ", kNopeDim, ", got ",
              nope_dim);
  TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
  TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
  TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
  TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
  TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
  // Nothing to copy when either leading dimension is empty; returning here
  // also avoids launching a kernel with a zero-sized grid.
  const int total_warps = num_tokens * num_heads;
  if (total_warps == 0) return;
  constexpr int warps_per_block = 8;
  // Round up so the last, possibly partial, block is still launched; the
  // kernel discards warps past total_warps.
  const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
  const int block_size = warps_per_block * 32;
  const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
    vllm::ConcatMLAQKernel<scalar_t, kNopeDim>
        <<<grid_size, block_size, 0, stream>>>(
            q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
            q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
            q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1),
            q_pe.stride(0), q_pe.stride(1));
  });
 }
--- a/csrc/concat_mla_q.cuh
+++ b/csrc/concat_mla_q.cuh
@@ -0,0 +1,60 @@
 #ifndef CONCAT_MLA_Q_CUH_
 #define CONCAT_MLA_Q_CUH_
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include "cuda_vec_utils.cuh"
 namespace vllm {
 // Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
 // q_pe [num_tokens, num_heads, 64]
 // into q_out [num_tokens, num_heads, NOPE_DIM+64].
 // Currently instantiated only for NOPE_DIM=512.
 // Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
 //
 // Work split: every group of 32 consecutive threads (one warp) copies one
 // (token, head) row. Warps whose flat index is past num_tokens * num_heads
 // return immediately, so the grid may be rounded up.
 //
 // Strides are in elements of DType; the last dimension of every tensor is
 // assumed to be unit-stride (the host wrapper checks this).
 template <typename DType, int NOPE_DIM>
 __global__ void ConcatMLAQKernel(
    DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
    const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
    const int64_t out_stride_0, const int64_t out_stride_1,
    const int64_t nope_stride_0, const int64_t nope_stride_1,
    const int64_t pe_stride_0, const int64_t pe_stride_1) {
  constexpr int kRopeDim = 64;
  // The rope part is moved as 32-bit words, one word per lane per pass.
  constexpr int kRopeBytesPerPass = sizeof(int) * 32;
  static_assert((kRopeDim * sizeof(DType)) % kRopeBytesPerPass == 0,
                "rope row must be a whole number of 32-lane x 32-bit passes");
  const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
  if (flat_warp_id >= num_tokens * num_heads) return;
  const int token_id = flat_warp_id / num_heads;
  const int head_id = flat_warp_id % num_heads;
  const int lane_id = threadIdx.x & 31;
  // Nope part: wide vector copies, 32 lanes per pass.
  constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
  constexpr int nope_vec_loads =
      NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
  const DType* nope_src =
      ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
  DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
 #pragma unroll
  for (int i = 0; i < nope_vec_loads; i++) {
    const int offset = i * 32 + lane_id;
    if constexpr (use_256b) {
      st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
               ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
    } else {
      st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
               ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
    }
  }
  // Rope part: written right after the NOPE_DIM elements of the output row.
  // The number of passes is derived from sizeof(DType): one pass (128 bytes)
  // covers 64 two-byte elements, while a four-byte DType needs two passes.
  // A single unconditional pass would copy only half of the rope row for
  // four-byte element types.
  constexpr int rope_word_loads =
      kRopeDim * sizeof(DType) / kRopeBytesPerPass;
  const int* rope_src = reinterpret_cast<const int*>(
      q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
  int* rope_dst = reinterpret_cast<int*>(nope_dst + NOPE_DIM);
 #pragma unroll
  for (int i = 0; i < rope_word_loads; i++) {
    const int offset = i * 32 + lane_id;
    st32_cs(rope_dst + offset, ld32_cs(rope_src + offset));
  }
 }
 }  // namespace vllm
 #endif  // CONCAT_MLA_Q_CUH_
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int64_t block_size, const int64_t block_size_stride) {
    // For AMX 2D tiles, size of each line is 64 bytes
    constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
-    // For AMX B martix, N always is 16
+    // For AMX B matrix, N always is 16
    constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
    constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
    // For now suppose block_size is divisible by amx_tile_column_num
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -13,6 +13,9 @@
 #elif defined(__aarch64__)
  // arm implementation
  #include "cpu_types_arm.hpp"
 #elif defined(__riscv_v)
  // riscv implementation
  #include "cpu_types_riscv.hpp"
 #else
  #warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
  #include "cpu_types_scalar.hpp"
--- a/csrc/cpu/cpu_types_riscv.hpp
+++ b/csrc/cpu/cpu_types_riscv.hpp
@@ -0,0 +1,832 @@
 #ifndef CPU_TYPES_RISCV_HPP
 #define CPU_TYPES_RISCV_HPP
 #include <algorithm>
 #include <cmath>
 #include <cstring>
 #include <iostream>
 #include <limits>
 #include <riscv_vector.h>
 #include <torch/all.h>
 // ============================================================================
 // Vector Register Type Definitions (VLEN=128 bits)
 // ============================================================================
 // Each fixed_* alias attaches riscv_rvv_vector_bits(N) to an RVV register
 // (group) type so it gets a compile-time size; the structs in this header
 // store these aliases as data members. N follows the pattern 128 x LMUL:
 //   m1 -> 128, m2 -> 256, m4 -> 512, m8 -> 1024.
 // NOTE(review): this presumably requires the translation unit to be built
 // with a fixed vector length of 128 bits -- confirm against the build flags.
 // Half-precision float registers.
 typedef vfloat16m1_t fixed_vfloat16m1_t
    __attribute__((riscv_rvv_vector_bits(128)));
 typedef vfloat16m2_t fixed_vfloat16m2_t
    __attribute__((riscv_rvv_vector_bits(256)));
 // Single-precision float registers.
 typedef vfloat32m1_t fixed_vfloat32m1_t
    __attribute__((riscv_rvv_vector_bits(128)));
 typedef vfloat32m2_t fixed_vfloat32m2_t
    __attribute__((riscv_rvv_vector_bits(256)));
 typedef vfloat32m4_t fixed_vfloat32m4_t
    __attribute__((riscv_rvv_vector_bits(512)));
 typedef vfloat32m8_t fixed_vfloat32m8_t
    __attribute__((riscv_rvv_vector_bits(1024)));
 // 32-bit signed integer registers.
 typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
 typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
 // 16-bit unsigned integer registers (also used to move raw bf16 bit
 // patterns through loads and stores).
 typedef vuint16m1_t fixed_vuint16m1_t
    __attribute__((riscv_rvv_vector_bits(128)));
 typedef vuint16m2_t fixed_vuint16m2_t
    __attribute__((riscv_rvv_vector_bits(256)));
 typedef vuint16m4_t fixed_vuint16m4_t
    __attribute__((riscv_rvv_vector_bits(512)));
 // Native bf16 vector registers; only declared when the build defines
 // RISCV_BF16_SUPPORT.
 #ifdef RISCV_BF16_SUPPORT
 typedef vbfloat16m1_t fixed_vbfloat16m1_t
    __attribute__((riscv_rvv_vector_bits(128)));
 typedef vbfloat16m2_t fixed_vbfloat16m2_t
    __attribute__((riscv_rvv_vector_bits(256)));
 typedef vbfloat16m4_t fixed_vbfloat16m4_t
    __attribute__((riscv_rvv_vector_bits(512)));
 #endif
 namespace vec_op {
 // Dtype dispatch helpers. VLLM_DISPATCH_CASE_FLOATING_TYPES expands to one
 // AT_DISPATCH_CASE per supported scalar type: Float and Half always, plus
 // BFloat16 only when RISCV_BF16_SUPPORT is defined.
 #ifdef RISCV_BF16_SUPPORT
  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 #else
  #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
 #endif
 // Switches on TYPE and runs the trailing callable for the matching case
 // above; NAME is forwarded to AT_DISPATCH_SWITCH.
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 // Marks a function as inline and asks the compiler to always inline it.
 #define FORCE_INLINE __attribute__((always_inline)) inline
 namespace {
 // Calls `body` once for every index in the sequence, in pack order. Each call
 // receives a std::integral_constant, so the index is usable as a compile-time
 // constant inside `body`.
 template <typename IndexT, IndexT... Is, typename Body>
 constexpr void unroll_loop_item(std::integer_sequence<IndexT, Is...>,
                                 Body&& body) {
  (body(std::integral_constant<IndexT, Is>{}), ...);
 }
 }  // namespace
 // Compile-time unrolled loop: invokes `f` with indices 0, 1, ..., count - 1
 // (each wrapped in std::integral_constant<T, i>). Participates in overload
 // resolution only when `f` is callable with a T. With count == 0 `f` is never
 // called.
 template <typename T, T count, typename F,
          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
 constexpr void unroll_loop(F&& f) {
  using index_pack = std::make_integer_sequence<T, count>;
  unroll_loop_item(index_pack{}, std::forward<F>(f));
 }
 // CRTP base shared by the vector wrappers in this header. The derived type T
 // supplies VEC_ELEM_NUM; this base exposes it through a uniform accessor.
 template <typename T>
 struct Vec {
  // Number of elements held by one T.
  static constexpr int get_elem_num() { return T::VEC_ELEM_NUM; }
 };
 struct FP32Vec8;
 struct FP32Vec16;
 // ============================================================================
 // FP16 Implementation
 // ============================================================================
 // Eight fp16 values held in one fixed-length m1 register.
 struct FP16Vec8 : public Vec<FP16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat16m1_t reg;
  // Loads VEC_ELEM_NUM consecutive fp16 values starting at ptr.
  explicit FP16Vec8(const void* ptr) {
    const _Float16* src = static_cast<const _Float16*>(ptr);
    reg = __riscv_vle16_v_f16m1(src, VEC_ELEM_NUM);
  }
  // Conversion from fp32 lanes; defined outside this struct.
  explicit FP16Vec8(const FP32Vec8&);
  // Stores all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Stores with elem_num passed as the vector length of the store.
  void save(void* ptr, int elem_num) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vse16_v_f16m1(dst, reg, elem_num);
  }
  // Stores all values with consecutive elements `stride` fp16 elements apart;
  // the intrinsic takes the distance in bytes, hence the scaling.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    const ptrdiff_t stride_in_bytes = stride * sizeof(_Float16);
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vsse16_v_f16m1(dst, stride_in_bytes, reg, VEC_ELEM_NUM);
  }
 };
 // Sixteen fp16 values held in one fixed-length m2 register group.
 struct FP16Vec16 : public Vec<FP16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vfloat16m2_t reg;
  // Loads VEC_ELEM_NUM consecutive fp16 values starting at ptr.
  explicit FP16Vec16(const void* ptr) {
    const _Float16* src = static_cast<const _Float16*>(ptr);
    reg = __riscv_vle16_v_f16m2(src, VEC_ELEM_NUM);
  }
  // Conversion from fp32 lanes; defined outside this struct.
  explicit FP16Vec16(const FP32Vec16& vec);
  // Stores all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Stores with elem_num passed as the vector length of the store.
  void save(void* ptr, int elem_num) const {
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vse16_v_f16m2(dst, reg, elem_num);
  }
  // Stores all values with consecutive elements `stride` fp16 elements apart;
  // the intrinsic takes the distance in bytes, hence the scaling.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    const ptrdiff_t stride_in_bytes = stride * sizeof(_Float16);
    _Float16* dst = static_cast<_Float16*>(ptr);
    __riscv_vsse16_v_f16m2(dst, stride_in_bytes, reg, VEC_ELEM_NUM);
  }
 };
 // ============================================================================
 // BF16 Implementation
 // ============================================================================
 #ifdef RISCV_BF16_SUPPORT
 // Reinterpret a bf16 vector as a vector of 16-bit unsigned integers with the
 // same LMUL. One overload per register-group size (m1 / m2 / m4); the store
 // helpers below use these to write bf16 data through u16 store intrinsics.
 FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t bf16_m1) {
  return __riscv_vreinterpret_v_bf16m1_u16m1(bf16_m1);
 }
 FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t bf16_m2) {
  return __riscv_vreinterpret_v_bf16m2_u16m2(bf16_m2);
 }
 FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t bf16_m4) {
  return __riscv_vreinterpret_v_bf16m4_u16m4(bf16_m4);
 }
 // Eight bf16 values held natively in one fixed-length m1 register. Memory
 // traffic goes through 16-bit unsigned loads/stores and a reinterpret.
 struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vbfloat16m1_t reg;
  // Loads VEC_ELEM_NUM consecutive 16-bit values from ptr as bf16.
  explicit BF16Vec8(const void* ptr) {
    const uint16_t* src = reinterpret_cast<const uint16_t*>(ptr);
    reg = __riscv_vreinterpret_v_u16m1_bf16m1(
        __riscv_vle16_v_u16m1(src, VEC_ELEM_NUM));
  }
  // Wraps an already-loaded register.
  explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {}
  // Conversion from fp32 lanes; defined outside this struct.
  explicit BF16Vec8(const FP32Vec8&);
  // Stores all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Stores with elem_num passed as the vector length of the store.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m1(dst, bf16_to_u16(reg), elem_num);
  }
  // Stores all values with consecutive elements `stride` 16-bit elements
  // apart; the intrinsic takes the distance in bytes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vsse16_v_u16m1(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
 };
 // Sixteen bf16 values held natively in one fixed-length m2 register group.
 // Memory traffic goes through 16-bit unsigned loads/stores and a reinterpret.
 struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vbfloat16m2_t reg;
  // Loads VEC_ELEM_NUM consecutive 16-bit values from ptr as bf16.
  explicit BF16Vec16(const void* ptr) {
    const uint16_t* src = reinterpret_cast<const uint16_t*>(ptr);
    reg = __riscv_vreinterpret_v_u16m2_bf16m2(
        __riscv_vle16_v_u16m2(src, VEC_ELEM_NUM));
  }
  // Wraps an already-loaded register group.
  explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {}
  // Conversion from fp32 lanes; defined outside this struct.
  explicit BF16Vec16(const FP32Vec16&);
  // Stores all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Stores with elem_num passed as the vector length of the store.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m2(dst, bf16_to_u16(reg), elem_num);
  }
  // Stores all values with consecutive elements `stride` 16-bit elements
  // apart; the intrinsic takes the distance in bytes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vsse16_v_u16m2(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
 };
 // Thirty-two bf16 values held natively in one fixed-length m4 register group.
 // Memory traffic goes through 16-bit unsigned loads/stores and a reinterpret.
 struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  fixed_vbfloat16m4_t reg;
  // Loads VEC_ELEM_NUM consecutive 16-bit values from ptr as bf16.
  explicit BF16Vec32(const void* ptr) {
    const uint16_t* src = reinterpret_cast<const uint16_t*>(ptr);
    reg = __riscv_vreinterpret_v_u16m4_bf16m4(
        __riscv_vle16_v_u16m4(src, VEC_ELEM_NUM));
  }
  // Wraps an already-loaded register group.
  explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {}
  // Builds the m4 group from four copies of the same 8-element vector.
  explicit BF16Vec32(const BF16Vec8& v) {
    const fixed_vuint16m1_t part = bf16_to_u16(v.reg);
    const fixed_vuint16m4_t tiled =
        __riscv_vcreate_v_u16m1_u16m4(part, part, part, part);
    reg = __riscv_vreinterpret_v_u16m4_bf16m4(tiled);
  }
  // Stores all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Stores with elem_num passed as the vector length of the store.
  void save(void* ptr, int elem_num) const {
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vse16_v_u16m4(dst, bf16_to_u16(reg), elem_num);
  }
  // Stores all values with consecutive elements `stride` 16-bit elements
  // apart; the intrinsic takes the distance in bytes.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    uint16_t* dst = reinterpret_cast<uint16_t*>(ptr);
    __riscv_vsse16_v_u16m4(dst, stride_in_bytes, bf16_to_u16(reg),
                           VEC_ELEM_NUM);
  }
 };
 #else
 // ============================================================================
 // BF16 Fallback Implementation (FP32 Simulation)
 // ============================================================================
 // Fallback for builds without RISCV_BF16_SUPPORT: eight bf16 values kept as
 // fp32 lanes in an m2 register group. Loading widens each 16-bit pattern
 // into the upper half of a 32-bit float; storing keeps only the upper 16
 // bits of each float (plain truncation, no rounding).
 struct BF16Vec8 : public Vec<BF16Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat32m2_t reg_fp32;
  // Reads VEC_ELEM_NUM 16-bit bf16 patterns from ptr and widens them to fp32.
  explicit BF16Vec8(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[VEC_ELEM_NUM];
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(&widened[lane], &bits, sizeof(bits));
    }
    reg_fp32 = __riscv_vle32_v_f32m2(widened, VEC_ELEM_NUM);
  }
  // Conversion from fp32 lanes; defined outside this struct.
  explicit BF16Vec8(const FP32Vec8&);
  // Writes all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Writes the first elem_num values contiguously at ptr.
  void save(void* ptr, int elem_num) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m2(lanes, reg_fp32, VEC_ELEM_NUM);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, &lanes[lane], sizeof(bits));
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }
  // Writes all values with consecutive elements `stride` 16-bit elements
  // apart (addresses are computed in bytes).
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m2(lanes, reg_fp32, VEC_ELEM_NUM);
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, &lanes[lane], sizeof(bits));
      const uint16_t upper = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * stride_in_bytes) = upper;
    }
  }
 };
 // Fallback for builds without RISCV_BF16_SUPPORT: sixteen bf16 values kept as
 // fp32 lanes in an m4 register group. Loading widens each 16-bit pattern
 // into the upper half of a 32-bit float; storing keeps only the upper 16
 // bits of each float (plain truncation, no rounding).
 struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vfloat32m4_t reg_fp32;
  // Reads VEC_ELEM_NUM 16-bit bf16 patterns from ptr and widens them to fp32.
  explicit BF16Vec16(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[VEC_ELEM_NUM];
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(&widened[lane], &bits, sizeof(bits));
    }
    reg_fp32 = __riscv_vle32_v_f32m4(widened, VEC_ELEM_NUM);
  }
  // Conversion from fp32 lanes; defined outside this struct.
  explicit BF16Vec16(const FP32Vec16&);
  // Writes all VEC_ELEM_NUM values contiguously at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }
  // Writes the first elem_num values contiguously at ptr.
  void save(void* ptr, int elem_num) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m4(lanes, reg_fp32, VEC_ELEM_NUM);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, &lanes[lane], sizeof(bits));
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }
  // Writes all values with consecutive elements `stride` 16-bit elements
  // apart (addresses are computed in bytes).
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m4(lanes, reg_fp32, VEC_ELEM_NUM);
    const ptrdiff_t stride_in_bytes = stride * sizeof(uint16_t);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, &lanes[lane], sizeof(bits));
      const uint16_t upper = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * stride_in_bytes) = upper;
    }
  }
 };
 // Software BF16 vector with 32 lanes, used when RISCV_BF16_SUPPORT is not
 // defined. Lanes are kept widened to FP32 in reg_fp32; conversion back to
 // BF16 keeps only the upper 16 bits of each FP32 value (truncation).
 struct BF16Vec32 : public Vec<BF16Vec32> {
  constexpr static int VEC_ELEM_NUM = 32;
  fixed_vfloat32m8_t reg_fp32;

  // Load 32 contiguous BF16 values from ptr. Each 16-bit pattern becomes the
  // upper half of a 32-bit word, which is then reinterpreted as a float.
  explicit BF16Vec32(const void* ptr) {
    const uint16_t* src = static_cast<const uint16_t*>(ptr);
    float widened[VEC_ELEM_NUM];
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      const uint32_t bits = static_cast<uint32_t>(src[lane]) << 16;
      std::memcpy(widened + lane, &bits, sizeof(bits));
    }
    reg_fp32 = __riscv_vle32_v_f32m8(widened, VEC_ELEM_NUM);
  }

  // Build a 32-lane vector by repeating the 8 lanes of v four times.
  explicit BF16Vec32(const BF16Vec8& v) {
    constexpr int kSrcLanes = 8;
    float src_lanes[kSrcLanes];
    __riscv_vse32_v_f32m2(src_lanes, v.reg_fp32, kSrcLanes);
    float tiled[VEC_ELEM_NUM];
    for (int offset = 0; offset < VEC_ELEM_NUM; offset += kSrcLanes) {
      std::memcpy(tiled + offset, src_lanes, sizeof(src_lanes));
    }
    reg_fp32 = __riscv_vle32_v_f32m8(tiled, VEC_ELEM_NUM);
  }

  // Store all 32 lanes as contiguous BF16 values at ptr.
  void save(void* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Store the first elem_num lanes as contiguous BF16 values at ptr.
  // elem_num must not exceed VEC_ELEM_NUM: the staging buffer has 32 entries.
  void save(void* ptr, int elem_num) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m8(lanes, reg_fp32, VEC_ELEM_NUM);
    uint16_t* dst = static_cast<uint16_t*>(ptr);
    for (int lane = 0; lane < elem_num; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, sizeof(bits));
      // Keep the high half only; the low 16 mantissa bits are dropped.
      dst[lane] = static_cast<uint16_t>(bits >> 16);
    }
  }

  // Store all 32 lanes as BF16 values spaced `stride` 16-bit elements apart.
  void save_strided(void* ptr, ptrdiff_t stride) const {
    float lanes[VEC_ELEM_NUM];
    __riscv_vse32_v_f32m8(lanes, reg_fp32, VEC_ELEM_NUM);
    uint8_t* base = static_cast<uint8_t*>(ptr);
    // Element stride expressed in bytes.
    ptrdiff_t step_bytes = stride * sizeof(uint16_t);
    for (int lane = 0; lane < VEC_ELEM_NUM; ++lane) {
      uint32_t bits;
      std::memcpy(&bits, lanes + lane, sizeof(bits));
      const uint16_t narrowed = static_cast<uint16_t>(bits >> 16);
      *reinterpret_cast<uint16_t*>(base + lane * step_bytes) = narrowed;
    }
  }
 };
 #endif
 // ============================================================================
 // FP32 Implementation
 // ============================================================================
 // Four-lane FP32 vector held in a fixed_vfloat32m1_t.
 struct FP32Vec4 : public Vec<FP32Vec4> {
  constexpr static int VEC_ELEM_NUM = 4;
  fixed_vfloat32m1_t reg;

  // Broadcast v to every lane.
  explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {}

  // All lanes zero.
  explicit FP32Vec4() : FP32Vec4(0.0f) {}

  // Load VEC_ELEM_NUM contiguous floats from ptr.
  explicit FP32Vec4(const float* ptr)
      : reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {}

  // Wrap an existing register value.
  explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {}

  explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}

  // Store all lanes contiguously at ptr.
  void save(float* ptr) const { save(ptr, VEC_ELEM_NUM); }

  // Store the first elem_num lanes contiguously at ptr.
  void save(float* ptr, int elem_num) const {
    __riscv_vse32_v_f32m1(ptr, reg, elem_num);
  }
 };
 // Eight-lane FP32 vector held in a fixed_vfloat32m2_t, with element-wise
 // arithmetic, a sum reduction, and polynomial exp / tanh / erf approximations.
 struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  fixed_vfloat32m2_t reg;

  // Broadcast v to every lane.
  explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {}
  // All lanes zero.
  explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {}
  // Load VEC_ELEM_NUM contiguous floats from ptr.
  explicit FP32Vec8(const float* ptr)
      : reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {}
  // Wrap an existing register value.
  explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {}
  explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}
  // Widen 8 FP16 lanes to FP32.
  explicit FP32Vec8(const FP16Vec8& v)
      : reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {}
  explicit FP32Vec8(fixed_vfloat16m1_t v)
      : reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {}
 #ifdef RISCV_BF16_SUPPORT
  // Widen 8 native BF16 lanes to FP32.
  explicit FP32Vec8(fixed_vbfloat16m1_t v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {}
  explicit FP32Vec8(const BF16Vec8& v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {}
 #else
  // The fallback BF16Vec8 already stores FP32 lanes; copy them as-is.
  explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {}
 #endif

  // Sum of all lanes (unordered reduction seeded with 0).
  float reduce_sum() const {
    fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  }

  // Element-wise arithmetic.
  FP32Vec8 operator*(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator+(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator-(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 operator/(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }

  // Element-wise min / max / absolute value over all lanes.
  FP32Vec8 min(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 max(const FP32Vec8& b) const {
    return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec8 abs() const {
    return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
  }

  // Element-wise min / max computed for the first elem_num lanes only.
  FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
    return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
  }
  FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
    return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
  }

  // Returns min(max_v, max(min_v, *this)) per lane.
  FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
    fixed_vfloat32m2_t temp =
        __riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
    return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
  }

  // Store all lanes contiguously at ptr.
  void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
  // Store the first elem_num lanes contiguously at ptr.
  void save(float* ptr, int elem_num) const {
    __riscv_vse32_v_f32m2(ptr, reg, elem_num);
  }
  // Store all lanes spaced `stride` floats apart.
  void save_strided(float* ptr, ptrdiff_t stride) const {
    ptrdiff_t byte_stride = stride * sizeof(float);
    __riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
  }

  // Approximate e^x per lane as 2^n * P(r), where x * log2(e) = n + r, n is
  // the float-to-int conversion of x * log2(e) and P is a degree-5 polynomial
  // in r evaluated in Horner form. 2^n is built directly from exponent bits.
  // Results that underflow become 0 and results that overflow become +inf.
  FP32Vec8 exp() const {
    const float inv_ln2 = 1.44269504088896341f;
    fixed_vfloat32m2_t x_scaled =
        __riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
    fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
    fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);
    fixed_vfloat32m2_t r =
        __riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);
    fixed_vfloat32m2_t poly =
        __riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);
    // Clamp n to [-127, 128] BEFORE adding the bias so the biased exponent
    // stays within [0, 255]. Without the upper bound, a large n would shift
    // into the sign bit (or wrap on the add) and produce a garbage scale;
    // with it, overflow saturates to the +inf bit pattern (255 << 23).
    fixed_vint32m2_t n_clamped = __riscv_vmin_vx_i32m2(
        __riscv_vmax_vx_i32m2(n_int, -127, VEC_ELEM_NUM), 128, VEC_ELEM_NUM);
    fixed_vint32m2_t biased_exp =
        __riscv_vadd_vx_i32m2(n_clamped, 127, VEC_ELEM_NUM);
    fixed_vint32m2_t exponent_bits =
        __riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
    fixed_vfloat32m2_t scale =
        __riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);
    return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
  }

  // Approximate tanh(x) as (e^(2x) - 1) / (e^(2x) + 1), with x first clamped
  // to [-9, 9].
  FP32Vec8 tanh() const {
    fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
        __riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
    fixed_vfloat32m2_t x2 =
        __riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
    FP32Vec8 exp_val = FP32Vec8(x2).exp();
    fixed_vfloat32m2_t num =
        __riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
    fixed_vfloat32m2_t den =
        __riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
    return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
  }

  // Approximate erf(x) per lane: 1 - poly(t) * e^(-x^2) with
  // t = 1 / (1 + p * |x|), then negated for lanes where x < 0. The
  // coefficients match the Abramowitz & Stegun 7.1.26 approximation.
  FP32Vec8 er() const {
    const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
                a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
    fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);
    fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
        __riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
    // Reverse divide: t = 1.0f / t.
    t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);
    fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a4, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a3, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a2, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
                                  a1, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);
    fixed_vfloat32m2_t exp_val =
        FP32Vec8(__riscv_vfneg_v_f32m2(
                     __riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
                     VEC_ELEM_NUM))
            .exp()
            .reg;
    // Reverse subtract: res = 1.0f - poly * exp_val.
    fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
        __riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
        VEC_ELEM_NUM);
    vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
    // Use the mask-undisturbed (_mu) form with res as the pass-through so
    // lanes with x >= 0 keep res. The plain _m form is mask-agnostic, which
    // leaves those inactive lanes unspecified.
    return FP32Vec8(
        __riscv_vfneg_v_f32m2_mu(mask, res, res, VEC_ELEM_NUM));
  }
 };
 // Sixteen-lane FP32 vector held in a fixed_vfloat32m4_t, with element-wise
 // arithmetic, reductions, and polynomial exp / tanh / erf approximations.
 struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  fixed_vfloat32m4_t reg;

  // Broadcast v to every lane.
  explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {}
  // All lanes zero.
  explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {}
  // Load VEC_ELEM_NUM contiguous floats from ptr.
  explicit FP32Vec16(const float* ptr)
      : reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {}
  // Wrap an existing register value.
  explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {}
  // Duplicate an 8-lane vector into both halves of the 16-lane vector.
  explicit FP32Vec16(const FP32Vec8& data)
      : reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {}
  explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}
  // Widening conversion from FP16; defined out of line.
  explicit FP32Vec16(const FP16Vec16& v);
 #ifdef RISCV_BF16_SUPPORT
  // Widen 16 native BF16 lanes to FP32.
  explicit FP32Vec16(fixed_vbfloat16m2_t v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {}
  explicit FP32Vec16(const BF16Vec16& v)
      : reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {}
 #else
  // The fallback BF16Vec16 already stores FP32 lanes; copy them as-is.
  explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {}
 #endif

  // Element-wise arithmetic.
  FP32Vec16 operator+(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec16 operator-(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec16 operator*(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec16 operator/(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }

  // Returns *this + a * b per lane (multiply-accumulate into a copy).
  FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
  }

  // Sum of all lanes (unordered reduction seeded with 0).
  float reduce_sum() const {
    fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  }
  // Maximum over all lanes, seeded with the lowest finite float.
  float reduce_max() const {
    fixed_vfloat32m1_t scalar =
        __riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
    scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  }
  // Minimum over all lanes, seeded with the largest finite float.
  float reduce_min() const {
    fixed_vfloat32m1_t scalar =
        __riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
    scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  }
  // Sum of lanes [idx * group_size, (idx + 1) * group_size), selected with a
  // mask built from the lane indices.
  template <int group_size>
  float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    const int start = idx * group_size;
    vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
    vbool8_t mask = __riscv_vmand_mm_b8(
        __riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
        __riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
        VEC_ELEM_NUM);
    fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
    scalar =
        __riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
    return __riscv_vfmv_f_s_f32m1_f32(scalar);
  };

  // Element-wise max / min / absolute value.
  FP32Vec16 max(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec16 min(const FP32Vec16& b) const {
    return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
  }
  FP32Vec16 abs() const {
    return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
  }
  // Returns min(max_v, max(min_v, *this)) per lane.
  FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
    return FP32Vec16(__riscv_vfmin_vv_f32m4(
        max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
        VEC_ELEM_NUM));
  }

  // Store all lanes contiguously at ptr.
  void save(float* ptr) const { __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM); }
  // Store the first elem_num lanes contiguously at ptr.
  void save(float* ptr, int elem_num) const {
    __riscv_vse32_v_f32m4(ptr, reg, elem_num);
  }
  // Store all lanes spaced `stride` floats apart.
  void save_strided(float* ptr, ptrdiff_t stride) const {
    ptrdiff_t byte_stride = stride * sizeof(float);
    __riscv_vsse32_v_f32m4(ptr, byte_stride, reg, VEC_ELEM_NUM);
  }

  // Approximate e^x per lane as 2^n * P(r), where x * log2(e) = n + r, n is
  // the float-to-int conversion of x * log2(e) and P is a degree-5 polynomial
  // in r evaluated in Horner form. 2^n is built directly from exponent bits.
  // Results that underflow become 0 and results that overflow become +inf.
  FP32Vec16 exp() const {
    const float inv_ln2 = 1.44269504088896341f;
    fixed_vfloat32m4_t x_scaled =
        __riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
    fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
    fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
    fixed_vfloat32m4_t r =
        __riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);
    fixed_vfloat32m4_t poly =
        __riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                  0.009618129107628f, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                  0.055504108664821f, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                  0.240226506959101f, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                  0.693147180559945f, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
                                  1.0f, VEC_ELEM_NUM);
    // Clamp n to [-127, 128] BEFORE adding the bias so the biased exponent
    // stays within [0, 255]. Without the upper bound, a large n would shift
    // into the sign bit (or wrap on the add) and produce a garbage scale;
    // with it, overflow saturates to the +inf bit pattern (255 << 23).
    fixed_vint32m4_t n_clamped = __riscv_vmin_vx_i32m4(
        __riscv_vmax_vx_i32m4(n_int, -127, VEC_ELEM_NUM), 128, VEC_ELEM_NUM);
    fixed_vint32m4_t biased_exp =
        __riscv_vadd_vx_i32m4(n_clamped, 127, VEC_ELEM_NUM);
    fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
        __riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));
    return FP32Vec16(__riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM));
  }

  // Approximate tanh(x) as (e^(2x) - 1) / (e^(2x) + 1), with x first clamped
  // to [-9, 9].
  FP32Vec16 tanh() const {
    fixed_vfloat32m4_t x_clamped = __riscv_vfmin_vf_f32m4(
        __riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
    FP32Vec16 exp_val =
        FP32Vec16(__riscv_vfmul_vf_f32m4(x_clamped, 2.0f, VEC_ELEM_NUM)).exp();
    return FP32Vec16(__riscv_vfdiv_vv_f32m4(
        __riscv_vfsub_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM),
        __riscv_vfadd_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM), VEC_ELEM_NUM));
  }

  // Approximate erf(x) per lane: 1 - poly(t) * e^(-x^2) with
  // t = 1 / (1 + p * |x|), then negated for lanes where x < 0. The
  // coefficients match the Abramowitz & Stegun 7.1.26 approximation.
  FP32Vec16 er() const {
    const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
                a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
    fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
    // Reverse divide: t = 1.0f / (1.0f + p * |x|).
    fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
        __riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
                               1.0f, VEC_ELEM_NUM),
        1.0f, VEC_ELEM_NUM);
    fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                  a4, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                  a3, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                  a2, VEC_ELEM_NUM);
    poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
                                  a1, VEC_ELEM_NUM);
    poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);
    fixed_vfloat32m4_t exp_val =
        FP32Vec16(__riscv_vfneg_v_f32m4(
                      __riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
                      VEC_ELEM_NUM))
            .exp()
            .reg;
    // Reverse subtract: res = 1.0f - poly * exp_val.
    fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
        __riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
        VEC_ELEM_NUM);
    vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
    // Use the mask-undisturbed (_mu) form with res as the pass-through so
    // lanes with x >= 0 keep res. The plain _m form is mask-agnostic, which
    // leaves those inactive lanes unspecified.
    return FP32Vec16(
        __riscv_vfneg_v_f32m4_mu(mask, res, res, VEC_ELEM_NUM));
  }
 };
 // ============================================================================
 // Type Traits & Global Helpers
 // ============================================================================
 // Maps a scalar element type T to the vector wrapper used for it. The
 // primary template yields void for types that have no specialization.
 template <typename T>
 struct VecType {
  using vec_type = void;
  using vec_t = void;
 };
 // Shorthand for VecType<T>::vec_type.
 template <typename T>
 using vec_t = typename VecType<T>::vec_type;
 // float -> 8-lane FP32 vector.
 template <>
 struct VecType<float> {
  using vec_type = FP32Vec8;
  using vec_t = FP32Vec8;
 };
 // c10::Half -> 8-lane FP16 vector.
 template <>
 struct VecType<c10::Half> {
  using vec_type = FP16Vec8;
  using vec_t = FP16Vec8;
 };
 // c10::BFloat16 -> 8-lane BF16 vector (native or FP32-simulated, depending
 // on which BF16Vec8 definition was compiled in).
 template <>
 struct VecType<c10::BFloat16> {
  using vec_type = BF16Vec8;
  using vec_t = BF16Vec8;
 };
 // Store the float v into *ptr, relying on T's assignment from float for the
 // conversion.
 template <typename T>
 void storeFP32(float v, T* ptr) {
  *ptr = v;
 }
 // c10::Half specialization: convert v to _Float16 and write it through a
 // reinterpreted pointer.
 // NOTE(review): assumes c10::Half has the same size and bit layout as
 // _Float16 - confirm.
 template <>
 inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
  *reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
 }
 // Narrow 16 FP32 lanes to FP16.
 inline FP16Vec16::FP16Vec16(const FP32Vec16& v)
    : reg(__riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM)) {}

 // Narrow 8 FP32 lanes to FP16.
 inline FP16Vec8::FP16Vec8(const FP32Vec8& v)
    : reg(__riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM)) {}

 // Widen 16 FP16 lanes to FP32.
 inline FP32Vec16::FP32Vec16(const FP16Vec16& v)
    : reg(__riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {}

 // In-place multiply-accumulate: acc becomes acc + a * b per lane.
 inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
  acc = acc.fma(a, b);
 }
 #ifdef RISCV_BF16_SUPPORT
 // Native BF16 path: convert v to __bf16 and assign it to *ptr.
 template <>
 inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  *ptr = static_cast<__bf16>(v);
 };
 // Narrow 8 FP32 lanes to native BF16.
 inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
    : reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
 // Narrow 16 FP32 lanes to native BF16.
 inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
    : reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
 #else
 // Fallback path: write the upper 16 bits of v's FP32 bit pattern into *ptr
 // (truncation; the low 16 bits are discarded).
 // NOTE(review): assumes c10::BFloat16 is a 16-bit type whose storage holds
 // the raw BF16 bits - confirm.
 template <>
 inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
  uint32_t val;
  std::memcpy(&val, &v, 4);
  *reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
 }
 // The fallback BF16 vectors store FP32 lanes, so these just copy the register.
 inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
 #endif
 // Hint the CPU to prefetch addr for reading (rw = 0) with locality hint 1.
 inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }
 }  // namespace vec_op
 #ifndef CPU_KERNEL_GUARD_IN
  #define CPU_KERNEL_GUARD_IN(NAME)
 #endif
 #ifndef CPU_KERNEL_GUARD_OUT
  #define CPU_KERNEL_GUARD_OUT(NAME)
 #endif
 #endif  // CPU_TYPES_RISCV_HPP
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
  };
  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
-#ifdef __aarch64__
+
  // dummy M size for prepacking weights
  // Prepacking weights improves performance and avoids runtime reorders
  constexpr dnnl_dim_t kProbeM = 128;
 #else
  constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
 #endif
  prepack_weight(args.b_ptr, original_b_md,
                 create_primitive_desc(
@@ -411,21 +408,19 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
  // dummy M size for prepacking weights
  // Prepacking weights improves performance and avoids runtime reorders
  constexpr dnnl_dim_t kProbeM = 128;
  prepack_weight(args.b_ptr, original_b_md,
                 create_primitive_desc(
-                     MSizeCacheKey{
+                     MSizeCacheKey{// Use a concrete M so oneDNN's kernel
-#ifdef VLLM_USE_ACL
+                                   // selector can choose an optimally blocked
-                         // Arm Compute Library (ACL) backend for oneDNN does
+                                   // weight layout.
-                         // not support runtime
+                                   .a_m_size = kProbeM,
-                         // dimensions, so we set M to a default value
+                                   .a_m_stride = b_k_size_,
-                         .a_m_size = 128,
+                                   .use_bias = false,
-                         .a_m_stride = b_k_size_,
+                                   .bias_type = dnnl::memory::data_type::undef},
 #else
                         .a_m_size = DNNL_RUNTIME_DIM_VAL,
                         .a_m_stride = DNNL_RUNTIME_DIM_VAL,
 #endif
                         .use_bias = false,
                         .bias_type = dnnl::memory::data_type::undef},
                     true)
                     .weights_desc());
  init_runtime_memory_cache(args);
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -4,7 +4,7 @@
 #include <torch/library.h>
-// Note: overwrite the external defination for sharing same name between
+// Note: overwrite the external definition for sharing same name between
 // libraries use different ISAs.
 #define TORCH_EXTENSION_NAME _C
--- a/csrc/cuda_vec_utils.cuh
+++ b/csrc/cuda_vec_utils.cuh
@@ -196,7 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
  return val;
 #else
  assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
-  return {};
+  return u32x8_t{};
 #endif
 }
@@ -211,23 +211,51 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
 #endif
 }
-// 32-bit cache-streaming (.cs) load / store  — SM100+ only.
+// 32-bit load / store.
 __device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
 __device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
 // 32-bit cache-streaming (.cs) load / store.
 // Falls back to ld32/st32 on ROCm (no .cs hint).
 __forceinline__ __device__ int ld32_cs(const int* addr) {
 #if VLLM_256B_PTX_ENABLED
  int val;
 #ifndef USE_ROCM
  asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
  return val;
 #else
-  assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
+  val = ld32(addr);
  return 0;
 #endif
  return val;
 }
 __forceinline__ __device__ void st32_cs(int* addr, int val) {
-#if VLLM_256B_PTX_ENABLED
+#ifndef USE_ROCM
  asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
 #else
-  assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
+  st32(addr, val);
 #endif
 }
 // 128-bit cache-streaming (.cs) load / store.
 // Falls back to ld128/st128 on ROCm (no .cs hint).
 __forceinline__ __device__ int4 ld128_cs(const int4* addr) {
  int4 val;
 #ifndef USE_ROCM
  asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
               : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
               : "l"(addr));
 #else
  ld128(val, addr);
 #endif
  return val;
 }
 __forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
 #ifndef USE_ROCM
  asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
               "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
 #else
  st128(val, addr);
 #endif
 }
@@ -260,7 +288,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
 __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
                                                 bool pred) {
-#if VLLM_256B_PTX_ENABLED
+#ifndef USE_ROCM
  uint32_t r0, r1, r2, r3;
  asm volatile(
@@ -278,7 +306,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
  val = uint4{r0, r1, r2, r3};
 #else
-  assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
+  assert(false && "ld128_cg_or_zero is not supported on ROCm");
 #endif
 }
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
 #ifndef USE_ROCM
  int flag = 0;
-  CUDA_CHECK(cuDeviceGetAttribute(
+  CUresult rdma_result = cuDeviceGetAttribute(
      &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-      device));
+      device);
-  if (flag) {  // support GPUDirect RDMA if possible
+  if (rdma_result == CUDA_SUCCESS &&
      flag) {  // support GPUDirect RDMA if possible
    prop.allocFlags.gpuDirectRDMACapable = 1;
  }
  int fab_flag = 0;
-  CUDA_CHECK(cuDeviceGetAttribute(
+  CUresult fab_result = cuDeviceGetAttribute(
-      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
-  if (fab_flag) {  // support fabric handle if possible
+  if (fab_result == CUDA_SUCCESS &&
      fab_flag) {  // support fabric handle if possible
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
  }
 #endif
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
  int32_t const block_ids_size = sorted_ids_size / block_size;
  int32_t const SENTINEL =
      num_batches * max_tokens_per_batch;  // To denote invalid entries.
-  // Intialize sorted_ids
+  // Initialize sorted_ids
  for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
    sorted_ids[i] = SENTINEL;
  }
-  // Intialize expert_ids with -1
+  // Initialize expert_ids with -1
  for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
    block_ids[i] = -1;
  }
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -73,10 +73,9 @@ void moe_permute(
  MOE_DISPATCH(input.scalar_type(), [&] {
    expandInputRowsKernelLauncher<scalar_t>(
        get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
-        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
+        get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
-        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
+        get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
-        get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
+        n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
        n_hidden, topk, n_local_expert, stream);
  });
 }
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t const* expert_first_token_offset, int64_t const num_rows,
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@@ -2,7 +2,7 @@
 template <typename T, bool CHECK_SKIPPED>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -16,7 +16,6 @@ __global__ void expandInputRowsKernel(
  int64_t expanded_dest_row = blockIdx.x;
  int64_t const expanded_source_row =
      expanded_dest_row_to_expanded_source_row[expanded_dest_row];
  int expert_id = sorted_experts[expanded_dest_row];
  if (threadIdx.x == 0) {
    assert(expanded_dest_row <= INT32_MAX);
@@ -54,7 +53,7 @@ __global__ void expandInputRowsKernel(
 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
    int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -70,12 +69,12 @@ void expandInputRowsKernelLauncher(
  bool is_check_skip = num_valid_tokens_ptr != nullptr;
  auto func = func_map[is_check_skip];
-  func<<<blocks, threads, 0, stream>>>(
+  func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
-      unpermuted_input, permuted_output, sorted_experts,
+                                       expanded_dest_row_to_expanded_source_row,
-      expanded_dest_row_to_expanded_source_row,
+                                       expanded_source_row_to_expanded_dest_row,
-      expanded_source_row_to_expanded_dest_row, permuted_idx,
+                                       permuted_idx, expert_first_token_offset,
-      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
+                                       num_rows, num_valid_tokens_ptr, cols, k,
-      num_local_experts);
+                                       num_local_experts);
 }
 template <class T, class U>
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
 std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
-void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
+std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
-                      torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_scale,
-                      torch::Tensor const& input_scale,
+    bool is_sf_swizzled_layout);
-                      bool is_sf_swizzled_layout);
+
 void scaled_fp4_quant_out(torch::Tensor const& input,
                          torch::Tensor const& input_scale,
                          bool is_sf_swizzled_layout, torch::Tensor& output,
                          torch::Tensor& output_scale);
 void scaled_fp4_experts_quant(
    torch::Tensor& output, torch::Tensor& output_scale,
--- a/Show More
+++ b/Show More