Compare commits


16 Commits

Author SHA1 Message Date
roikoren755
95c0f928cd [NemotronH] Small fix reasoning parser (#36635)
Signed-off-by: Roi Koren <roik@nvidia.com>
(cherry picked from commit e661b9ee83)
2026-03-11 02:51:18 -07:00
Shaun Kotek
c9b1e977dc add nemotron v3 reasoning parser (#36393)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Co-authored-by: root <root@gpu-259.slurm-workers-slurm.slurm.svc.cluster.local>
(cherry picked from commit 203a7f27da)
2026-03-11 02:51:04 -07:00
Kevin H. Luu
1ff2393897 [ci] Bound nvidia-cudnn-frontend version (#36719)
Signed-off-by: khluu <khluu000@gmail.com>
(cherry picked from commit 82b110d50e)
2026-03-10 21:20:41 -07:00
Benjamin Chislett
5bec0b0ba3 [DSV3.2][MTP] Optimize Indexer MTP handling (#36723)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
(cherry picked from commit 9040cd40af)
2026-03-10 21:20:23 -07:00
Wei Zhao
6da1310f91 [Bug] Fix TRTLLM Block FP8 MoE Monolithic (#36296)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
(cherry picked from commit 84e436ed1c)
2026-03-10 19:08:18 -07:00
khluu
bc46be5daf Revert "add nemotron v3 reasoning parser (#36393)"
This reverts commit 8e39d39fd4.
2026-03-10 11:47:09 -07:00
Shaun Kotek
8e39d39fd4 add nemotron v3 reasoning parser (#36393)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Co-authored-by: root <root@gpu-259.slurm-workers-slurm.slurm.svc.cluster.local>
(cherry picked from commit 203a7f27da)
2026-03-10 09:50:38 -07:00
Vadim Gimpelson
46fa044cc1 [BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
(cherry picked from commit 4ff8c3c8f9)
2026-03-10 09:26:18 -07:00
amirkl94
ab43e37158 Fix: Re-Enable EP for trtllm MoE FP8 backend (#36494)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
(cherry picked from commit 156e33553c)
2026-03-10 09:26:03 -07:00
Shaun Kotek
f45d010120 Fix/resupport nongated fused moe triton (#36412)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: liweiguang <codingpunk@gmail.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Alex Brooks <albrooks@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
Signed-off-by: Tushar Shetty <tushar.shetty@abbyy.com>
Signed-off-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: nvnbagrov <nbagrov@nvidia.com>
Co-authored-by: Sage <80211083+sagearc@users.noreply.github.com>
Co-authored-by: danisereb <daserebrenik@nvidia.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Weiguang Li <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Alex Brooks <albrooks@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: cong-or <conchubhar.gannon@gmail.com>
Co-authored-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Co-authored-by: liuzhenwei <zhenwei.liu@intel.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit fa028207aa)
2026-03-10 09:25:51 -07:00
amitz-nv
244b922088 [Bugfix] Fix passing of activation_type to trtllm fused MoE NVFP4 and FP8 (#36017)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
(cherry picked from commit d7adcadb9b)
2026-03-10 09:25:36 -07:00
khluu
b31e9326a7 Bound openai to under 2.25.0
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-06 13:04:15 -08:00
Doug Smith
e346c08560 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
(cherry picked from commit 0bfa229bf1)
2026-03-06 13:03:53 -08:00
Avery Miao
b7a423cb01 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
(cherry picked from commit e998fa76b9)
2026-03-06 13:03:40 -08:00
Cyrus Leung
fa78ec8a72 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 7196348157)
2026-03-06 13:03:26 -08:00
Kunshang Ji
9a474ce7a4 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit a8f66cbde8)
2026-03-06 13:03:05 -08:00
1107 changed files with 44909 additions and 81578 deletions

View File

@@ -10,7 +10,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm

View File

@@ -21,20 +21,6 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true

View File

@@ -25,7 +25,9 @@ fi
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_X86=true \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \
--progress plain .

View File

@@ -13,10 +13,9 @@ import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08
@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success
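
A quick sketch of how the two acceptance checks in the hunk above differ (illustrative numbers only; `ground_truth`, `measured_value`, and `rtol` follow the surrounding test, and `np.isclose` is applied per its documented |a-b| <= atol + rtol*|b| rule):

import numpy as np

ground_truth, rtol = 0.80, 0.08

for measured_value in (0.75, 0.90):
    # One-sided check: only penalizes scores below ground_truth * (1 - rtol).
    one_sided = measured_value >= ground_truth * (1 - rtol)
    # Symmetric check: also fails when the score overshoots by more than ~rtol.
    symmetric = np.isclose(ground_truth, measured_value, rtol=rtol)
    print(measured_value, one_sided, symmetric)

# 0.75 -> one_sided=True,  symmetric=True   (within 8% either way)
# 0.90 -> one_sided=True,  symmetric=False  (overshoot is rejected by isclose)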

View File

@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},

View File

@@ -7,12 +7,12 @@ import argparse
import html as _html
import json
import os
from contextlib import nullcontext
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -33,45 +33,6 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# -----------------------------
# Core data compare
# -----------------------------
@@ -91,25 +52,19 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can
group by columns.
- If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
"""
print("\ncompare_data_column:", data_column)
frames = []
raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +75,12 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files."
)
union_index = None
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
meta_added = False
for file in files:
df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
if drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True)
for c in (
@@ -163,61 +105,35 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
if data_column in df_idx.columns:
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label
name_s = None
if not meta_added:
frames.append(meta)
meta_added = True
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
if union_index is None:
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
frames.append(s)
raw_data_cols.append(file_label)
metric_series_aligned.append(s_aligned)
compare_frames.append(s)
if debug and name_s is not None:
frames.append(name_s.reindex(union_index))
if len(metric_series_aligned) >= 2:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
if "P99" in data_column or "Median" in data_column:
ratio = base / current
else:
ratio = current / base
ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
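
A minimal sketch of the "Ratio 1 vs N" convention computed just above, assuming only two result files and made-up values; `metric_ratio` and the series names are hypothetical, but the inversion rule mirrors the diff: latency-like columns (P99/Median) use base/current so that a value above 1 always means the last file improved on the first.

import pandas as pd

def metric_ratio(base: pd.Series, current: pd.Series, data_column: str) -> pd.Series:
    # Latency-like metrics (P99 / Median): lower is better, so invert the
    # ratio; throughput-like metrics keep current / base.
    if "P99" in data_column or "Median" in data_column:
        return base / current
    return current / base

p99_file1 = pd.Series([100.0, 200.0])   # hypothetical P99 TTFT (ms), file 1
p99_file2 = pd.Series([80.0, 220.0])    # hypothetical P99 TTFT (ms), file 2
print(metric_ratio(p99_file1, p99_file2, "P99 TTFT (ms)"))  # 1.25, ~0.909
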
@@ -288,10 +204,24 @@ def split_json_by_tp_pp(
# -----------------------------
# Styling helpers
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, threshold: float
) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df)
key_cols = [
@@ -304,24 +234,12 @@ def _highlight_threshold(
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -368,30 +286,11 @@ def _sanitize_sheet_name(name: str) -> str:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = " ".join(name.split())
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
@@ -399,57 +298,30 @@ def _sanitize_sheet_name(name: str) -> str:
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
model = d.get("Model", "model")
model_short = str(model).split("/")[-1]
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
startrow = 0
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
pd.DataFrame([[title]]).to_excel(
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
)
startrow += 1
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
@@ -556,11 +428,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok(
df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
@@ -573,14 +441,7 @@ def _max_concurrency_ok(
if d.empty:
return pd.NA
# Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
ok = d[d[cfg_col] <= threshold]
if ok.empty:
return pd.NA
@@ -646,25 +507,15 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
@@ -693,8 +544,8 @@ def build_valid_max_concurrency_summary_html(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -769,24 +620,15 @@ def build_valid_max_concurrency_summary_df(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
@@ -815,8 +657,8 @@ def build_valid_max_concurrency_summary_df(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -909,21 +751,7 @@ def build_parser() -> argparse.ArgumentParser:
help="Reference limit for TPOT plots (ms)",
)
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
@@ -1015,13 +843,9 @@ def render_metric_table_html(
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
styler = _highlight_threshold(display_group, args.ttft_max_ms)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
styler = _highlight_threshold(display_group, args.tpot_max_ms)
else:
styler = display_group.style
@@ -1138,46 +962,22 @@ def write_report_group_first(
csv_dir.mkdir(parents=True, exist_ok=True)
excel_path = args.excel_out or "perf_comparison.xlsx"
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if xw is not None:
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
@@ -1193,19 +993,12 @@ def write_report_group_first(
main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
@@ -1266,7 +1059,7 @@ def write_report_group_first(
)
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
@@ -1274,7 +1067,7 @@ def write_report_group_first(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
@@ -1304,13 +1097,9 @@ def write_report_group_first(
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")

View File

@@ -12,13 +12,6 @@ DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
@@ -190,304 +183,6 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
@@ -652,48 +347,10 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +382,14 @@ run_serving_tests() {
fi
# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve $server_model \
server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -778,14 +436,6 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
@@ -794,9 +444,8 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args_effective $client_remote_args "
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -818,11 +467,6 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
@@ -888,7 +532,6 @@ main() {
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}
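
A minimal Python sketch of the adaptive-concurrency refinement removed in this file's diff, assuming a `probe` callback that runs one benchmark and reports whether both TTFT and TPOT SLAs pass (the function name and example numbers are hypothetical; the bisection between the last passing and first failing static point, bounded by a probe budget, mirrors `adaptive_refine_from_static_results` above):

from typing import Callable

def refine_max_concurrency(last_pass: int, first_fail: int,
                           probe: Callable[[int], bool],
                           max_probes: int = 8) -> tuple[int, int]:
    """Bisect between the largest passing and smallest failing concurrency."""
    low, high, probes = last_pass, first_fail, 0
    while low + 1 < high and probes < max_probes:
        mid = (low + high) // 2
        probes += 1
        if probe(mid):
            low = mid
        else:
            high = mid
    return low, high   # final_last_pass, final_first_fail

# Hypothetical SLA: concurrency up to 44 passes, anything higher fails.
print(refine_max_concurrency(32, 64, lambda c: c <= 44))   # (44, 45)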

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -149,39 +149,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
@@ -221,45 +188,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {

View File

@@ -72,6 +72,17 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
@@ -94,6 +105,17 @@
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
@@ -117,25 +139,14 @@
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
"random-output-len": 128
}
}
]

View File

@@ -10,6 +10,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -88,6 +91,7 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,

View File

@@ -5,6 +5,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -22,6 +23,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -39,6 +41,7 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -56,6 +59,7 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,

View File

@@ -83,7 +83,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:

View File

@@ -166,19 +166,12 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
# Notify Slack if webhook is configured.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
@@ -201,11 +194,10 @@ data = {
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi

View File

@@ -205,13 +205,6 @@ re_quote_pytest_markers() {
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
@@ -249,11 +242,6 @@ re_quote_pytest_markers() {
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
@@ -504,8 +492,6 @@ else
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e BUILDKITE_PARALLEL_JOB \
-e BUILDKITE_PARALLEL_JOB_COUNT \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log
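
A hedged Python equivalent of the `wait_for_pid_and_check_log` pattern in the removed script above: launch a command with its output redirected to a log, wait for it, and surface the last lines of the log on failure. `run_and_check` is a hypothetical helper; the commented SDE invocation mirrors one of the probes above and assumes the `./sde` tree exists.

import subprocess

def run_and_check(cmd: list[str], log_file: str) -> bool:
    """Run one command; on a non-zero exit, print the tail of its log."""
    with open(log_file, "w") as log:
        proc = subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)
    rc = proc.wait()
    if rc != 0:
        print(f"Process exited with status {rc}; last lines of {log_file}:")
        with open(log_file) as log:
            print("".join(log.readlines()[-50:]))
        return False
    return True

# Illustrative use, mirroring one of the SDE probes above (requires the SDE tree):
# run_and_check(["./sde/sde64", "-skl", "--", "python3",
#                "examples/basic/offline_inference/generate.py",
#                "--model", "facebook/opt-125m"], "test_0.log")
ok = run_and_check(["python3", "-c", "print('hello')"], "test_0.log")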

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
'
EXITCODE=$?

View File

@@ -34,15 +34,15 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine

View File

@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else
# Non-ROCm platform (CUDA/other)

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
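# Point the OpenAI-compatible client at the local vLLM server and root BFCL's output under output_dir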
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
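# Collect a callable's keyword defaults from its signature, unwrapping Typer
# OptionInfo wrappers, so the Typer commands generate()/evaluate() can be
# invoked as plain Python functions.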
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

View File

@@ -72,7 +72,7 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels
# call script to generate indicies for all existing wheels
# these indices have relative paths that work as long as they are next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/

File diff suppressed because it is too large

View File

@@ -14,3 +14,8 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd

View File

@@ -36,16 +36,6 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
@@ -101,8 +91,8 @@ steps:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
@@ -132,9 +122,9 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
@@ -150,8 +140,8 @@ steps:
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
@@ -205,7 +195,7 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"

View File

@@ -50,18 +50,23 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,6 +84,19 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
@@ -88,47 +106,6 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
device: h100
@@ -169,7 +146,7 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -233,19 +210,6 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"

View File

@@ -1,5 +1,5 @@
group: Engine
depends_on:
depends_on:
- image-build
steps:
- label: Engine
@@ -14,30 +14,28 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: Engine (1 GPU)
timeout_in_minutes: 30
- label: V1 e2e + engine (1 GPU)
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/engine/
- tests/v1/engine/
- vllm/
- tests/v1
commands:
# TODO: accuracy does not match on H100, whether or not
# VLLM_USE_FLASHINFER_SAMPLER is set.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -48,7 +46,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
@@ -64,7 +62,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4

View File

@@ -24,6 +24,11 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -34,13 +39,8 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
@@ -65,6 +65,11 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50
@@ -82,11 +87,6 @@ steps:
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness
timeout_in_minutes: 30

View File

@@ -8,9 +8,8 @@ steps:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
@@ -97,7 +96,7 @@ steps:
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py

View File

@@ -67,13 +67,12 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 basic/offline_inference/chat.py
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0

View File

@@ -1,110 +0,0 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"

View File

@@ -65,7 +65,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -2,65 +2,16 @@ group: Models - Multimodal
depends_on:
- image-build
steps:
- label: "Multi-Modal Models (Standard) 1: qwen2"
timeout_in_minutes: 45
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU)
depends_on:
@@ -103,11 +54,6 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2
optional: true

View File

@@ -15,12 +15,9 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin

View File

@@ -1,40 +0,0 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"

.github/mergify.yml
View File

@@ -3,7 +3,6 @@ pull_request_rules:
description: Automatically apply documentation label
conditions:
- label != stale
- -closed
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash
uv pip install pre-commit>=4.5.1
uv pip install pre-commit
pre-commit install
pre-commit run --all-files
```
@@ -38,13 +37,15 @@ pull_request_rules:
> [!TIP]
> <details>
> <summary>Is <code>mypy</code> failing?</summary>
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/>
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
>
> ```bash
> # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ```
> </details>
@@ -334,7 +335,7 @@ pull_request_rules:
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*

View File

@@ -6,9 +6,6 @@ on:
- main
workflow_dispatch: # Manual trigger
permissions:
contents: read
jobs:
macos-m1-smoke-test:
runs-on: macos-latest

.gitignore
View File

@@ -189,9 +189,11 @@ cython_debug/
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# Cursor

View File

@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: v1.43.5
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
@@ -24,13 +24,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/DavidAnson/markdownlint-cli2
rev: v0.21.0
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
hooks:
- id: markdownlint-cli2
language_version: lts
args: [--fix]
exclude: ^CLAUDE\.md$
- id: markdownlint
exclude: '.*\.inc\.md'
stages: [manual] # Only run in CI
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
@@ -56,7 +55,7 @@ repos:
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -128,13 +127,6 @@ repos:
language: python
types: [python]
additional_dependencies: [regex]
# prevent use torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' APIs call"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py

View File

@@ -9,7 +9,6 @@ build:
python: "3.12"
jobs:
post_checkout:
# - bash docs/maybe_skip_pr_build.sh
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv

AGENTS.md
View File

@@ -1,113 +0,0 @@
# Agent Instructions for vLLM
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
> Breaching these guidelines can result in automatic banning.
## 1. Contribution Policy (Mandatory)
### Duplicate-work checks
Before proposing a PR, run these checks:
```bash
gh issue view <issue_number> --repo vllm-project/vllm --comments
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
```
- If an open PR already addresses the same fix, do not open another.
- If your approach is materially different, explain the difference in the issue.
### No low-value busywork PRs
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
### Accountability
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
- The submitting human must review every changed line and run relevant tests.
- PR descriptions for AI-assisted work **must** include:
- Why this is not duplicating an existing PR.
- Test commands run and results.
- Clear statement that AI assistance was used.
### Fail-closed behavior
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
---
## 2. Development Workflow
### Environment setup
```bash
# Install `uv` if you don't have it already:
curl -LsSf https://astral.sh/uv/install.sh | sh
# Always use `uv` for Python environment management:
uv venv --python 3.12
source .venv/bin/activate
# Always make sure `pre-commit` and its hooks are installed:
uv pip install -r requirements/lint.txt
pre-commit install
```
### Installing dependencies
```bash
# If you are only making Python changes:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
# If you are also making C/C++ changes:
uv pip install -e .
```
### Running tests
Tests require extra dependencies.
All versions for test dependencies should be read from `requirements/test.txt`
```bash
# Install bare minimum test dependencies:
uv pip install pytest pytest-asyncio tblib
# Install additional test dependencies as needed, or install them all as follows:
uv pip install -r requirements/test.txt
# Run specific test from specific test file
pytest tests/path/to/test.py -v -s -k test_name
# Run all tests in directory
pytest tests/path/to/dir -v -s
```
### Running linters
```bash
# Run all pre-commit hooks on staged files:
pre-commit run
# Run on all files:
pre-commit run --all-files
# Run a specific hook:
pre-commit run ruff-check --all-files
# Run mypy as it is in CI:
pre-commit run mypy-3.10 --all-files --hook-stage manual
```
### Commit messages
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```

View File

@@ -1 +0,0 @@
@AGENTS.md

View File

@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.

View File

@@ -187,7 +187,7 @@ python benchmark.py \
## Hardware Requirements
| Backend | Hardware |
| ------- | -------- |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |

View File

@@ -59,9 +59,7 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
)
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -442,21 +440,14 @@ def main():
# Backend selection
parser.add_argument(
"--backends",
"--decode-backends",
nargs="+",
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
parser.add_argument(
"--prefill-backends",
nargs="+",
help="Prefill backends to compare (fa2, fa3, fa4). "
"Uses the first decode backend for impl construction.",
)
# Batch specifications
parser.add_argument(
@@ -511,7 +502,7 @@ def main():
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backend is not None or args.backends is not None
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
@@ -521,12 +512,6 @@ def main():
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
elif "decode_backends" in yaml_config:
args.backends = yaml_config["decode_backends"]
args.backend = None
# Prefill backends (e.g., ["fa3", "fa4"])
args.prefill_backends = yaml_config.get("prefill_backends", None)
# Check for special modes
if "mode" in yaml_config:
@@ -628,10 +613,7 @@ def main():
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
@@ -868,93 +850,37 @@ def main():
else:
# Normal mode: compare backends
decode_results = []
prefill_results = []
total = len(backends) * len(args.batch_specs)
# Run decode backend comparison
if not prefill_backends:
# No prefill backends specified: compare decode backends as before
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
result = run_benchmark(config)
decode_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
if not result.success:
console.print(
f"[red]Error {backend} {spec}: {result.error}[/]"
)
pbar.update(1)
pbar.update(1)
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(decode_results, backends)
# Run prefill backend comparison
if prefill_backends:
# Use first decode backend for impl construction
decode_backend = backends[0]
total = len(prefill_backends) * len(args.batch_specs)
console.print(
f"[yellow]Prefill comparison mode: "
f"using {decode_backend} for decode impl[/]"
)
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
for spec in args.batch_specs:
for pb in prefill_backends:
config = BenchmarkConfig(
backend=decode_backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
prefill_backend=pb,
)
result = run_benchmark(config)
# Label result with prefill backend name for display
labeled_config = replace(result.config, backend=pb)
result = replace(result, config=labeled_config)
prefill_results.append(result)
if not result.success:
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
pbar.update(1)
console.print("\n[bold green]Prefill Backend Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(
prefill_results, prefill_backends, compare_to_fastest=True
)
all_results = decode_results + prefill_results
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:

View File

@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparsable specs
# Fallback for unparseable specs
return (0, 0, 0)
@@ -77,7 +77,6 @@ class MockKVBProj:
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
self.weight = torch.empty(0, dtype=torch.bfloat16)
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
@@ -214,7 +213,6 @@ class BenchmarkConfig:
use_cuda_graphs: bool = False
# MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None

View File

@@ -1,19 +1,4 @@
# MLA prefill backend comparison
#
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
#
# Backends that aren't available on the current platform will report errors
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
#
# Usage:
# python benchmark.py --config configs/mla_prefill.yaml
description: "MLA prefill backend comparison"
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
@@ -27,25 +12,20 @@ model:
v_head_dim: 128
block_size: 128
# model:
# name: "deepseek-v2-lite"
# num_layers: 27
# num_q_heads: 16
# num_kv_heads: 1
# head_dim: 576
# kv_lora_rank: 512
# qk_nope_head_dim: 128
# qk_rope_head_dim: 64
# v_head_dim: 128
# block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "q512"
- "q1k"
- "q2k"
- "q4k"
- "q8k"
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
@@ -64,63 +44,19 @@ batch_specs:
- "8q4k"
- "8q8k"
# Chunked prefill / extend
# Short context
- "q128s1k"
- "q256s2k"
- "q512s4k"
- "q1ks4k"
- "q2ks8k"
- "2q128s1k"
- "2q256s2k"
- "2q512s4k"
- "2q1ks4k"
- "2q2ks8k"
- "4q128s1k"
- "4q256s2k"
- "4q512s4k"
- "4q1ks4k"
- "4q2ks8k"
- "8q128s1k"
- "8q256s2k"
- "8q512s4k"
- "8q1ks4k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
# Medium context
- "q128s16k"
- "q512s16k"
- "q1ks16k"
- "q2ks16k"
- "2q128s16k"
- "2q512s16k"
- "2q1ks16k"
- "2q2ks16k"
- "4q128s16k"
- "4q512s16k"
- "4q1ks16k"
- "4q2ks16k"
# Long context
- "q128s64k"
- "q512s64k"
- "q1ks64k"
- "q2ks64k"
- "2q128s64k"
- "2q512s64k"
- "2q1ks64k"
- "2q2ks64k"
decode_backends:
- CUTLASS_MLA
prefill_backends:
- fa2
- fa3
- fa4
- flashinfer
- cudnn
- trtllm
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 20
warmup_iters: 5
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -62,7 +62,6 @@ def create_minimal_vllm_config(
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -76,9 +75,6 @@ def create_minimal_vllm_config(
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
"cudnn", "trtllm"). Configures the attention config to
force the specified prefill backend.
Returns:
VllmConfig for benchmarking
@@ -149,6 +145,7 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
@@ -167,7 +164,7 @@ def create_minimal_vllm_config(
compilation_config = CompilationConfig()
vllm_config = VllmConfig(
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
@@ -175,84 +172,9 @@ def create_minimal_vllm_config(
compilation_config=compilation_config,
)
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
"disable_flashinfer_prefill"
]
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
"use_cudnn_prefill"
]
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
"use_trtllm_ragged_deepseek_prefill"
]
return vllm_config
# ============================================================================
# Prefill Backend Configuration
# ============================================================================
# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths.
# Non-FA backends enable their specific path and disable others.
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
"fa2": {
"flash_attn_version": 2,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa3": {
"flash_attn_version": 3,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa4": {
"flash_attn_version": 4,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"flashinfer": {
"flash_attn_version": None,
"disable_flashinfer_prefill": False,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"cudnn": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": True,
"use_trtllm_ragged_deepseek_prefill": False,
},
"trtllm": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": True,
},
}
def get_prefill_backend_config(prefill_backend: str) -> dict:
"""Get attention config overrides for a prefill backend."""
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
raise ValueError(
f"Unknown prefill backend: {prefill_backend!r}. "
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
)
return _PREFILL_BACKEND_CONFIG[prefill_backend]
# ============================================================================
# Decode Backend Configuration
# Backend Configuration
# ============================================================================
@@ -282,7 +204,6 @@ def _get_backend_config(backend: str) -> dict:
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backend import MultipleOf
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
@@ -299,8 +220,8 @@ def _get_backend_config(backend: str) -> dict:
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
# No fixed block size; fall back to config value
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
@@ -756,11 +677,16 @@ def _run_single_benchmark(
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl.forward_mha(
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
@@ -775,7 +701,7 @@ def _run_single_benchmark(
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
times = []
@@ -788,7 +714,7 @@ def _run_single_benchmark(
forward_fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
@@ -807,7 +733,6 @@ def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
@@ -819,13 +744,11 @@ def _run_mla_benchmark_batched(
to avoid setup/teardown overhead.
Args:
backend: Backend name (decode backend used for impl construction)
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
List of BenchmarkResult objects
@@ -835,7 +758,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
@@ -858,25 +781,11 @@ def _run_mla_benchmark_batched(
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
)
results = []
with set_current_vllm_config(vllm_config):
# Clear cached prefill backend detection functions so they re-evaluate
# with the current VllmConfig. These are @functools.cache decorated and
# would otherwise return stale results from a previous backend's config.
from vllm.model_executor.layers.attention.mla_attention import (
use_cudnn_prefill,
use_flashinfer_prefill,
use_trtllm_ragged_deepseek_prefill,
)
use_flashinfer_prefill.cache_clear()
use_cudnn_prefill.cache_clear()
use_trtllm_ragged_deepseek_prefill.cache_clear()
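The cache_clear() calls above exist because `@functools.cache` memoizes on arguments alone, so a zero-argument detection helper latches onto whatever it computed under the first VllmConfig it saw. A self-contained sketch of that failure mode; the module-level flag and the helper name here are illustrative, not the real vLLM functions:

```python
import functools

_active_backend = "flashinfer"   # stands in for "the current VllmConfig"


@functools.cache
def detect_flashinfer_prefill() -> bool:
    # Cached on an empty argument list: evaluated once, then frozen.
    return _active_backend == "flashinfer"


print(detect_flashinfer_prefill())   # True

_active_backend = "cudnn"
print(detect_flashinfer_prefill())   # still True -- stale cached answer

detect_flashinfer_prefill.cache_clear()
print(detect_flashinfer_prefill())   # False -- re-evaluated under the new config
```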
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
@@ -886,38 +795,6 @@ def _run_mla_benchmark_batched(
index_topk=index_topk if is_sparse else None,
)
# Verify the actual prefill backend matches what was requested
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
fa_version = prefill_cfg["flash_attn_version"]
if fa_version is not None:
# FA backend: verify the impl's FA version
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
f"version {fa_version}, but the impl is using FA "
f"version {actual_fa_version}. Check "
f"vllm/v1/attention/backends/fa_utils.py."
)
else:
# Non-FA backend: verify the builder picked the right path
expected_flags = {
"flashinfer": "_use_fi_prefill",
"cudnn": "_use_cudnn_prefill",
"trtllm": "_use_trtllm_ragged_prefill",
}
flag_name = expected_flags.get(prefill_backend)
if flag_name and not getattr(builder_instance, flag_name, False):
raise RuntimeError(
f"Prefill backend '{prefill_backend}' was requested "
f"but the metadata builder did not enable it. This "
f"usually means a dependency is missing (e.g., "
f"flashinfer not installed) or the platform doesn't "
f"support it."
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -968,7 +845,6 @@ def run_mla_benchmark(
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
@@ -986,8 +862,6 @@ def run_mla_benchmark(
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
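For orientation, a sketch of the shapes these two entry points traffic in. Only the (config, threshold, num_splits) tuple layout and the single-versus-list return behaviour come from the docstrings above; `make_config` is a hypothetical stand-in for building the benchmark config objects used elsewhere in this file:

```python
def make_config(batch_size):
    """Hypothetical helper; builds one benchmark config for a given batch size."""
    ...


# What the batched runner consumes: one tuple per benchmark, all sharing a
# single backend impl so setup/teardown happens only once.
configs_with_params = [
    # (config, reorder_batch_threshold, num_kv_splits)
    (make_config(1), 1, None),       # FlashAttn/FlashMLA-style: threshold set
    (make_config(32), None, 4),      # CUTLASS-style: num_kv_splits set
    (make_config(128), None, None),  # neither knob applies
]

# run_mla_benchmark mirrors this: a single config yields one BenchmarkResult,
# a list of configs yields a list of BenchmarkResult objects in the same order.
```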
@@ -1011,9 +885,7 @@ def run_mla_benchmark(
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
)
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input
return results[0] if return_single else results

View File

@@ -141,6 +141,7 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
@@ -390,7 +391,7 @@ def _run_single_benchmark(
attn_metadata,
output=out,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
times = []
@@ -411,15 +412,15 @@ def _run_single_benchmark(
)
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
"reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
@@ -443,7 +444,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)

View File

@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |

View File

@@ -94,18 +94,15 @@ def create_logits(
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.accelerator.synchronize()
return (
torch.accelerator.memory_allocated(),
torch.accelerator.max_memory_allocated(),
)
torch.cuda.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.accelerator.reset_peak_memory_stats()
torch.accelerator.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
@@ -126,7 +123,7 @@ def benchmark_function(
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
@@ -143,7 +140,7 @@ def benchmark_function(
func(logits_copy, k, p)
end_events[i].record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Calculate timing
times = [

View File

@@ -1,98 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 dimensions
NOPE_DIM = 512
ROPE_DIM = 64
NUM_HEADS = 128
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
def get_configs():
return NUM_TOKENS
def make_inputs(num_tokens, dtype):
"""Create inputs matching the real code path.
Args:
contiguous_nope: If False, simulate the transposed BMM output
(non-contiguous nope with stride pattern from
[N,B,L].transpose(0,1)).
"""
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
ql_nope = raw.transpose(0, 1)
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
return ql_nope, q_pe
# ---- Non-contiguous nope benchmark (real code path) ----
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens"],
x_vals=get_configs(),
line_arg="provider",
line_vals=["torch_cat", "concat_mla_q"],
line_names=["torch.cat", "concat_mla_q (v8)"],
styles=[("blue", "--"), ("green", "-")],
ylabel="Latency (us)",
plot_name="concat_mla_q-transposed",
args={},
)
)
def bench_transposed(num_tokens, provider):
dtype = torch.bfloat16
ql_nope, q_pe = make_inputs(num_tokens, dtype)
q_out = torch.empty(
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch_cat":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
)
else:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
parser.add_argument(
"--save-path", type=str, default=None, help="Path to save benchmark results"
)
args = parser.parse_args()
print("\n" + "=" * 70)
print("CONCAT MLA Q KERNEL BENCHMARKS")
print("=" * 70)
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
print(
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
)
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
print("=" * 70)
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
bench_transposed.run(print_data=True, save_path=args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -1,153 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import math
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 MLA dimensions
NOPE_DIM = 512
ROPE_DIM = 64
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
# Realistic prefill scenarios:
# - 1 long prefill: single request, 16K-96K tokens
# - 4 medium prefills: 4 requests, 4K-24K tokens each
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
SCENARIOS = [
# (label, num_reqs, total_tokens_list)
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
]
def make_inputs(total_tokens, num_reqs, block_size):
"""Create synthetic FP8 cache, block table, and output buffer.
Fills the cache with random bytes (we only measure throughput,
not correctness). Block table maps each request to contiguous
physical blocks.
"""
# Divide tokens evenly across requests
base_len = total_tokens // num_reqs
remainder = total_tokens % num_reqs
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
# workspace_starts: cumulative sum of seq_lens
workspace_starts = [0] * num_reqs
for r in range(1, num_reqs):
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
# Physical blocks needed per request
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
total_blocks = sum(blocks_per_req)
max_blocks = max(blocks_per_req)
# Allocate cache with random data (content doesn't matter for perf)
cache = torch.randint(
0,
256,
(total_blocks, block_size, ENTRY_BYTES),
dtype=torch.uint8,
device="cuda",
)
# Block table: contiguous block assignments
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
block_idx = 0
for r in range(num_reqs):
for b in range(blocks_per_req[r]):
block_table[r, b] = block_idx
block_idx += 1
# Output workspace
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
workspace_starts_t = torch.tensor(
workspace_starts, dtype=torch.int32, device="cuda"
)
return cache, dst, block_table, seq_lens_t, workspace_starts_t
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["total_tokens"],
x_vals=total_tokens_list,
line_arg="provider",
line_vals=["cuda_kernel"],
line_names=["cp_gather_fp8 (CUDA)"],
styles=[("green", "-")],
ylabel="Latency (us)",
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
args={"num_reqs": num_reqs},
)
)
def bench_fn(total_tokens, provider, num_reqs):
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
total_tokens, num_reqs, BLOCK_SIZE
)
quantiles = [0.5, 0.2, 0.8]
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
),
quantiles=quantiles,
rep=500,
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
seq_len_per_req = total_tokens_list[0] // num_reqs
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
print(
f"\n--- {label}: {num_reqs} request(s), "
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
)
bench_fn.run(print_data=True, save_path=save_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
)
parser.add_argument(
"--save-path",
type=str,
default=None,
help="Path to save benchmark results as CSV",
)
args = parser.parse_args()
# Print data volume info for bandwidth analysis
read_per_token = ENTRY_BYTES # 656 bytes from cache
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
total_per_token = read_per_token + write_per_token # 1808 bytes
print("\n" + "=" * 70)
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
print("=" * 70)
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
print(f"Per token: {total_per_token} bytes (read + write)")
print(f"Block size: {BLOCK_SIZE} tokens/block")
print("=" * 70)
for label, num_reqs, total_tokens_list in SCENARIOS:
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -168,7 +168,7 @@ def bench_impl(
# warmup
for kwargs in kwargs_list:
impl_type.get_impl()(**kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
# reference output
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
# test output
# test ouptut
out_q, out_s = output_from_impl(
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
)

View File

@@ -64,7 +64,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.accelerator.current_device_index())
init_workspace_manager(torch.cuda.current_device())
(m, k, n) = mkn
dtype = torch.half
@@ -171,7 +171,7 @@ def bench_run(
activation=MoEActivation.SILU,
global_num_experts=num_experts,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
@@ -187,14 +187,14 @@ def bench_run(
topk_ids,
quant_config=quant_config,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Timing
start_event = torch.Event(enable_timing=True)
@@ -202,7 +202,7 @@ def bench_run(
latencies = []
for _ in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -307,7 +307,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -330,7 +330,7 @@ def bench_run(
e=num_experts,
device=device,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -345,7 +345,7 @@ def bench_run(
w2_fp8scale,
a_fp8_scale,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
if not should_use_fn(tensor):
return None
torch.accelerator.synchronize()
torch.cuda.synchronize()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
for _ in range(num_warmup):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.perf_counter()
for _ in range(num_trials):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
@@ -495,7 +495,7 @@ def main():
# Set device
device = torch.device(f"cuda:{rank}")
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
# Get CPU process group
cpu_group = dist.new_group(backend="gloo")

View File

@@ -385,32 +385,32 @@ def benchmark_operation(
# Warmup before graph capture
for _ in range(warmup):
operation_func(*args, **kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Create CUDA graph
graph = torch.cuda.CUDAGraph()
num_op_per_cudagraph = 10
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
device = torch.device(f"cuda:{torch.cuda.current_device()}")
with graph_capture(device=device), torch.cuda.graph(graph):
for _ in range(num_op_per_cudagraph):
operation_func(*args, **kwargs)
# Graph warmup
torch.accelerator.synchronize()
torch.cuda.synchronize()
for _ in range(warmup):
graph.replay()
# Benchmark with CUDA graph
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.perf_counter()
for _ in range(trials // num_op_per_cudagraph):
# operation_func(*args, **kwargs)
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
avg_time_ms = ((end_time - start_time) / trials) * 1000
@@ -984,7 +984,7 @@ def main():
world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}")
torch.accelerator.set_device_index(device)
torch.cuda.set_device(device)
torch.set_default_device(device)
init_distributed_environment()

View File

@@ -50,7 +50,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.accelerator.current_device_index())
init_workspace_manager(torch.cuda.current_device())
label = "Quant Matmul"
sub_label = (
@@ -224,7 +224,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -239,7 +239,7 @@ def bench_run(
topk_weights,
topk_ids,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -254,7 +254,7 @@ def bench_run(
w2_scale,
a_scale,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -34,14 +34,14 @@ def main(
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -1035,7 +1035,7 @@ def bench_optype(
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}

View File

@@ -47,13 +47,13 @@ def benchmark_method(
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms

View File

@@ -54,7 +54,7 @@ def clear_triton_cache():
# Clear CUDA memory cache
if torch.cuda.is_available():
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
# Try to clear Triton's runtime cache
try:
@@ -304,19 +304,19 @@ def benchmark_config(
# JIT compilation & warmup
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -324,7 +324,7 @@ def benchmark_config(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
@@ -626,11 +626,7 @@ class BenchmarkWorker:
if visible_device != f"{self.device_id}":
need_device_guard = True
with (
torch.accelerator.device_index(self.device_id)
if need_device_guard
else nullcontext()
):
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
for idx, config in enumerate(tqdm(search_space)):
try:
kernel_time = benchmark_config(

View File

@@ -131,7 +131,7 @@ def benchmark_config(
topk_ids,
quant_config=quant_config,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
quant_config=quant_config,
)
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
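Every kernel timing loop in these scripts uses the same CUDA-event pattern; a minimal standalone version (plain PyTorch, any CUDA device):

```python
import torch

assert torch.cuda.is_available()
x = torch.randn(4096, 4096, device="cuda")

# Warmup / JIT so the first timed iteration is not an outlier.
for _ in range(5):
    x @ x
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

num_iters = 20
start.record()
for _ in range(num_iters):
    x @ x
end.record()
torch.cuda.synchronize()  # make sure the end event has actually been reached

# elapsed_time() is in milliseconds; per-iteration microseconds as in the script above.
print(f"{start.elapsed_time(end) / num_iters * 1000:.1f} us/iter")
```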

View File

@@ -69,19 +69,19 @@ def benchmark_permute(
# JIT compilation & warmup
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
# JIT compilation & warmup
input = prepare()
run(input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run(input)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -135,14 +135,14 @@ def benchmark_mrope(
key.clone(),
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
key_clone,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics

View File

@@ -103,7 +103,7 @@ def main(
max_logits = torch.empty_like(exp_sums)
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
)
else:
raise ValueError(f"Invalid version: {version}")
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -28,7 +28,7 @@ def _time_cuda(
# warmup
for _ in range(warmup_iters):
fn()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
for _ in range(bench_iters):
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
return start.elapsed_time(end) / bench_iters # ms/iter

View File

@@ -29,7 +29,7 @@ def main(
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.accelerator.synchronize()
torch.cuda.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.accelerator.synchronize()
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -84,16 +84,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -104,7 +104,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
return lat

View File

@@ -109,16 +109,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -129,7 +129,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.accelerator.empty_cache()
torch.cuda.empty_cache()
return lat

View File

@@ -251,7 +251,7 @@ def benchmark(
kernel(
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
)
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
# Benchmark
latencies: list[float] = []
for _ in range(runs):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
for i in range(iterations_per_run):

View File

@@ -126,7 +126,7 @@ def benchmark_decode(
)
def time_fn(fn, warmup=10, trials=20):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
start.record()
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -138,7 +138,7 @@ def benchmark_prefill(
)
def time_fn(fn, warmup=10, trials=20):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
start.record()
fn()
end.record()
torch.accelerator.synchronize()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -177,18 +177,18 @@ def benchmark_config(
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.accelerator.synchronize()
torch.cuda.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.accelerator.synchronize()
torch.cuda.synchronize()
start_event.record()
run()
end_event.record()
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.accelerator.set_device_index(gpu_id)
torch.cuda.set_device(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
def main(args):
print(args)
num_gpus = torch.accelerator.device_count()
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")

View File

@@ -35,7 +35,7 @@ def benchmark_shape(
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.accelerator.synchronize()
torch.cuda.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
# Warmup
for _ in range(warmup):
func()
torch.accelerator.synchronize()
torch.cuda.synchronize()
# Timing loop
torch.accelerator.synchronize()
torch.cuda.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.accelerator.synchronize()
torch.cuda.synchronize()
end = time.time()
# Calculate timing and TFLOPS

View File

@@ -79,8 +79,7 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_ARM_BF16)
@@ -102,13 +101,11 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
"-mavx512f"
"-mavx512vl"
"-mavx512bw"
"-mavx512dq")
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
${CXX_COMPILE_FLAGS_AVX512}
"-mamx-bf16"
"-mamx-tile"
"-mavx512dq"
"-mavx512bf16"
"-mavx512vnni")
"-mavx512vnni"
"-mamx-bf16"
"-mamx-tile")
list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
@@ -145,19 +142,11 @@ elseif (S390_FOUND)
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
message(STATUS "RISC-V detected")
if(RVV_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
add_compile_definitions(RISCV_BF16_SUPPORT)
elseif (RVV_FP16_FOUND)
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
if(RVV_FOUND)
message(FAIL_ERROR "Can't support rvv now.")
else()
message(STATUS "compile riscv with scalar")
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
@@ -253,24 +242,13 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
)
else()
message(STATUS "Downloading oneDNN from GitHub")
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
GIT_PROGRESS TRUE
GIT_SHALLOW FALSE
)
else()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -316,8 +294,7 @@ endif()
# TODO: Refactor this
if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -369,15 +346,13 @@ if(USE_ONEDNN)
endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_SGL
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
@@ -403,48 +378,31 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
set(_C_AVX2_LIBS numa)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
LIBRARIES ${_C_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
USE_SABI 3
WITH_SOABI
)
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
# AVX512F
define_extension_target(
_C_AVX512
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${_C_AVX512_LIBS}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# AVX2
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
define_extension_target(
_C_AVX2
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${_C_AVX2_LIBS}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3
WITH_SOABI

View File

@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -74,12 +74,6 @@ void indexer_k_quant_and_cache(
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Concatenate query nope and rope for MLA/DSA attention
void concat_mla_q(
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]

View File

@@ -8,7 +8,6 @@
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#include "concat_mla_q.cuh"
#ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
@@ -919,8 +918,8 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
@@ -931,12 +930,6 @@ __global__ void gather_and_maybe_dequant_cache(
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
@@ -966,10 +959,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(
head_dim == 320 || head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
"for better performance")
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@@ -994,13 +986,7 @@ void gather_and_maybe_dequant_cache(
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
if (head_dim == 576) {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_576);
} else {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_320);
}
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
}
namespace vllm {
@@ -1009,67 +995,75 @@ namespace vllm {
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
const int32_t* __restrict__ workspace_starts, // [num_reqs]
const int32_t num_reqs, const int32_t block_size,
const int32_t total_tokens, const int64_t block_table_stride,
const int64_t cache_block_stride, const int64_t cache_entry_stride,
const int64_t dst_entry_stride) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= total_tokens) return;
const int lane_id = threadIdx.x & 31;
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ seq_lens, // [BATCH]
const int32_t* __restrict__ workspace_starts, // [BATCH]
const int32_t block_size, const int32_t head_dim,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = workspace_starts[bid];
const int32_t seq_len = seq_lens[bid];
const int32_t tot_slots = seq_len;
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
// Binary search to find which request owns this output token
int lo = 0, hi = num_reqs - 1;
while (lo < hi) {
int mid = (lo + hi + 1) >> 1;
if (workspace_starts[mid] <= flat_warp_id)
lo = mid;
else
hi = mid - 1;
const int32_t split_start = split * split_slots;
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
const bool is_active_split = (split_start < tot_slots);
if (!is_active_split) return;
// Adjust the pointer for the block_table for this batch
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = split_start;
int32_t offset_div = offset / block_size;
offset = offset % block_size;
const int32_t* batch_block_table = block_table + batch_offset;
// Adjust dst pointer based on the cumulative sequence lengths
dst += seq_start * dst_entry_stride;
const int tid = threadIdx.x;
// Process each token in this split
for (int pid = split_start; pid < split_end; ++pid) {
auto block_id = batch_block_table[offset_div];
const uint8_t* token_ptr =
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
const uint8_t* no_pe_ptr = token_ptr;
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const __nv_bfloat16* rope_ptr =
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
if (tid < 512) {
// FP8 dequantization
const int tile = tid >> 7; // each tile is 128 elements
const float scale = scales_ptr[tile];
const uint8_t val = no_pe_ptr[tid];
dst_ptr[tid] =
fp8::scaled_convert<__nv_bfloat16, uint8_t,
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
} else if (tid < 576) {
// Rope copy (64 bf16 elements)
const int rope_idx = tid - 512;
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
}
// Move to next token
offset += 1;
if (offset == block_size) {
offset_div += 1;
offset = 0;
}
}
const int req_id = lo;
// Compute physical token address via block table
const int out_token_id = flat_warp_id;
const int token_offset = out_token_id - workspace_starts[req_id];
const int cache_block_idx = token_offset / block_size;
const int offset_in_block = token_offset % block_size;
const int physical_block =
block_table[req_id * block_table_stride + cache_block_idx];
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
offset_in_block * cache_entry_stride;
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
const int4 fp8_data = nope_src[lane_id];
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const float scale = scales_ptr[lane_id >> 3];
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
#ifdef USE_ROCM
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
#else
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
#endif
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
rope_dst[lane_id] = rope_src[lane_id];
}
template <typename scalar_t>
@@ -1263,16 +1257,15 @@ void cp_gather_and_upconvert_fp8_kv_cache(
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
}
const int total_tokens = dst.size(0);
constexpr int warps_per_block = 8;
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
const int block_size_threads = warps_per_block * 32; // 256 threads
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
stream>>>(
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
static_cast<int32_t>(batch_size), block_size, total_tokens,
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table_stride, cache_block_stride, cache_entry_stride,
dst_entry_stride);
}
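Both versions of the kernel above decode the same 656-byte cache entry: 512 FP8 "nope" values quantized in four 128-element tiles (one float32 scale each), followed by 64 BF16 RoPE values, written out as a 576-element BF16 row. A pure-PyTorch reference for a single entry, plus the request lookup that the warp-per-token variant performs with a binary search; this is a readability sketch (recent PyTorch with float8 support assumed), not a description of the kernel's launch configuration:

```python
import bisect

import torch


def upconvert_entry(entry: torch.Tensor) -> torch.Tensor:
    """entry: uint8 tensor of shape [656] -> bfloat16 tensor of shape [576]."""
    assert entry.dtype == torch.uint8 and entry.numel() == 656
    fp8 = entry[:512].view(torch.float8_e4m3fn).to(torch.float32)  # 512 nope values
    scales = entry[512:528].view(torch.float32)                    # 4 per-tile scales
    rope = entry[528:].view(torch.bfloat16)                        # 64 rope values
    nope = (fp8.reshape(4, 128) * scales[:, None]).reshape(512)    # dequantize per tile
    return torch.cat([nope.to(torch.bfloat16), rope])


print(upconvert_entry(torch.randint(0, 256, (656,), dtype=torch.uint8)).shape)

# Request lookup used by the warp-per-token kernel: find the last
# workspace_starts entry that is <= the flat output token id.
workspace_starts = [0, 4096, 8192, 12288]                 # hypothetical 4 requests
req_id = bisect.bisect_right(workspace_starts, 9000) - 1  # -> 2
```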
@@ -1372,43 +1365,3 @@ void cp_gather_indexer_k_quant_cache(
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
// rope_dim]
) {
const int num_tokens = ql_nope.size(0);
const int num_heads = ql_nope.size(1);
const int nope_dim = ql_nope.size(2);
const int rope_dim = q_pe.size(2);
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
nope_dim);
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
if (num_tokens == 0) return;
constexpr int warps_per_block = 8;
const int total_warps = num_tokens * num_heads;
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
const int block_size = warps_per_block * 32;
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
q_pe.stride(1));
});
}
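The wrapper above is just a fused replacement for torch.cat((ql_nope, q_pe), dim=-1) on the transposed BMM output, which is exactly what the removed benchmark exercised. A hedged correctness check along the same lines; it assumes a CUDA build of vLLM that still ships ops.concat_mla_q:

```python
import torch
from vllm import _custom_ops as ops

num_tokens, num_heads, nope_dim, rope_dim = 128, 128, 512, 64

# Simulate the real code path: BMM output [N, B, L].transpose(0, 1) -> [B, N, L],
# i.e. a non-contiguous nope input, plus a contiguous rope input.
raw = torch.randn(num_heads, num_tokens, nope_dim, dtype=torch.bfloat16, device="cuda")
ql_nope = raw.transpose(0, 1)
q_pe = torch.randn(num_tokens, num_heads, rope_dim, dtype=torch.bfloat16, device="cuda")
q_out = torch.empty(
    num_tokens, num_heads, nope_dim + rope_dim, dtype=torch.bfloat16, device="cuda"
)

ops.concat_mla_q(ql_nope, q_pe, q_out)
torch.testing.assert_close(q_out, torch.cat((ql_nope, q_pe), dim=-1))
```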

View File

@@ -1,60 +0,0 @@
#ifndef CONCAT_MLA_Q_CUH_
#define CONCAT_MLA_Q_CUH_
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "cuda_vec_utils.cuh"
namespace vllm {
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
// q_pe [num_tokens, num_heads, 64]
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
// Currently instantiated only for NOPE_DIM=512.
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
template <typename DType, int NOPE_DIM>
__global__ void ConcatMLAQKernel(
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
const int64_t out_stride_0, const int64_t out_stride_1,
const int64_t nope_stride_0, const int64_t nope_stride_1,
const int64_t pe_stride_0, const int64_t pe_stride_1) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= num_tokens * num_heads) return;
const int token_id = flat_warp_id / num_heads;
const int head_id = flat_warp_id % num_heads;
const int lane_id = threadIdx.x & 31;
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
constexpr int nope_vec_loads =
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
const DType* nope_src =
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
#pragma unroll
for (int i = 0; i < nope_vec_loads; i++) {
const int offset = i * 32 + lane_id;
if constexpr (use_256b) {
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
} else {
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
}
}
const int* rope_src = reinterpret_cast<const int*>(
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
head_id * out_stride_1 + NOPE_DIM);
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
}
} // namespace vllm
#endif // CONCAT_MLA_Q_CUH_

View File

@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int64_t block_size, const int64_t block_size_stride) {
// For AMX 2D tiles, size of each line is 64 bytes
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
// For AMX B matrix, N always is 16
// For AMX B martix, N always is 16
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
// For now suppose block_size is divisible by amx_tile_column_num

View File

@@ -13,9 +13,6 @@
#elif defined(__aarch64__)
// arm implementation
#include "cpu_types_arm.hpp"
#elif defined(__riscv_v)
// riscv implementation
#include "cpu_types_riscv.hpp"
#else
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
#include "cpu_types_scalar.hpp"

View File

@@ -1,832 +0,0 @@
#ifndef CPU_TYPES_RISCV_HPP
#define CPU_TYPES_RISCV_HPP
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#include <riscv_vector.h>
#include <torch/all.h>
// ============================================================================
// Vector Register Type Definitions (VLEN=128 bits)
// ============================================================================
typedef vfloat16m1_t fixed_vfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat16m2_t fixed_vfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m1_t fixed_vfloat32m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat32m2_t fixed_vfloat32m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m4_t fixed_vfloat32m4_t
__attribute__((riscv_rvv_vector_bits(512)));
typedef vfloat32m8_t fixed_vfloat32m8_t
__attribute__((riscv_rvv_vector_bits(1024)));
typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
typedef vuint16m1_t fixed_vuint16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vuint16m2_t fixed_vuint16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vuint16m4_t fixed_vuint16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#ifdef RISCV_BF16_SUPPORT
typedef vbfloat16m1_t fixed_vbfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vbfloat16m2_t fixed_vbfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vbfloat16m4_t fixed_vbfloat16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#endif
namespace vec_op {
#ifdef RISCV_BF16_SUPPORT
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#else
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#endif
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define FORCE_INLINE __attribute__((always_inline)) inline
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
};
} // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
};
struct FP32Vec8;
struct FP32Vec16;
// ============================================================================
// FP16 Implementation
// ============================================================================
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat16m1_t reg;
explicit FP16Vec8(const void* ptr)
: reg(__riscv_vle16_v_f16m1(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m1(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
struct FP16Vec16 : public Vec<FP16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat16m2_t reg;
explicit FP16Vec16(const void* ptr)
: reg(__riscv_vle16_v_f16m2(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m2(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
// ============================================================================
// BF16 Implementation
// ============================================================================
#ifdef RISCV_BF16_SUPPORT
FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t v) {
return __riscv_vreinterpret_v_bf16m1_u16m1(v);
}
FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t v) {
return __riscv_vreinterpret_v_bf16m2_u16m2(v);
}
FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t v) {
return __riscv_vreinterpret_v_bf16m4_u16m4(v);
}
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vbfloat16m1_t reg;
explicit BF16Vec8(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vle16_v_u16m1(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {};
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vbfloat16m2_t reg;
explicit BF16Vec16(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vle16_v_u16m2(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vbfloat16m4_t reg;
explicit BF16Vec32(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m4_bf16m4(__riscv_vle16_v_u16m4(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {};
explicit BF16Vec32(const BF16Vec8& v) {
fixed_vuint16m1_t u16_val = bf16_to_u16(v.reg);
fixed_vuint16m4_t u16_combined =
__riscv_vcreate_v_u16m1_u16m4(u16_val, u16_val, u16_val, u16_val);
reg = __riscv_vreinterpret_v_u16m4_bf16m4(u16_combined);
};
void save(void* ptr) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
#else
// ============================================================================
// BF16 Fallback Implementation (FP32 Simulation)
// ============================================================================
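// Without hardware BF16 support each BF16 lane is kept widened to FP32.
// Loads place the 16 bf16 bits into the upper half of a float (<< 16);
// stores take the upper 16 bits back (truncation, no rounding).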
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg_fp32;
explicit BF16Vec8(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[8];
for (int i = 0; i < 8; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m2(tmp, 8);
}
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg_fp32;
explicit BF16Vec16(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[16];
for (int i = 0; i < 16; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m4(tmp, 16);
}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vfloat32m8_t reg_fp32;
explicit BF16Vec32(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[32];
for (int i = 0; i < 32; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp, 32);
}
explicit BF16Vec32(const BF16Vec8& v) {
float tmp_small[8];
__riscv_vse32_v_f32m2(tmp_small, v.reg_fp32, 8);
float tmp_large[32];
for (int i = 0; i < 4; ++i) {
std::memcpy(tmp_large + (i * 8), tmp_small, 8 * sizeof(float));
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp_large, 32);
}
void save(void* ptr) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
#endif
// ============================================================================
// FP32 Implementation
// ============================================================================
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
fixed_vfloat32m1_t reg;
explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {};
explicit FP32Vec4() : reg(__riscv_vfmv_v_f_f32m1(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec4(const float* ptr)
: reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {};
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
void save(float* ptr) const { __riscv_vse32_v_f32m1(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m1(ptr, reg, elem_num);
}
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg;
explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const float* ptr)
: reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {};
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat16m1_t v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec8(fixed_vbfloat16m1_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const BF16Vec8& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {};
#endif
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 max(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 abs() const {
return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
fixed_vfloat32m2_t temp =
__riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m2(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
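// exp(x) via 2^(x/ln2) = 2^n * 2^r with n = round(x/ln2) and r in ~[-0.5, 0.5]:
// 2^r is approximated by a degree-5 Taylor polynomial (coefficients ln2^k / k!)
// and 2^n is built by writing the biased exponent n + 127 directly into the
// float exponent bits. The biased exponent is clamped at zero, so very
// negative inputs flush to 0; large positive inputs are not range-checked.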
FP32Vec8 exp() const {
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m2_t x_scaled =
__riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);
fixed_vfloat32m2_t r =
__riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly =
__riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);
fixed_vint32m2_t biased_exp =
__riscv_vadd_vx_i32m2(n_int, 127, VEC_ELEM_NUM);
biased_exp = __riscv_vmax_vx_i32m2(biased_exp, 0, VEC_ELEM_NUM);
fixed_vint32m2_t exponent_bits =
__riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
fixed_vfloat32m2_t scale =
__riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);
return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
}
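// tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), with x clamped to [-9, 9] where
// tanh is already +/-1 to float precision, keeping exp() in range.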
FP32Vec8 tanh() const {
fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
__riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t x2 =
__riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
FP32Vec8 exp_val = FP32Vec8(x2).exp();
fixed_vfloat32m2_t num =
__riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t den =
__riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
}
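// erf via the Abramowitz & Stegun 7.1.26 rational approximation:
// erf(|x|) ~ 1 - (a1*t + a2*t^2 + ... + a5*t^5) * exp(-x^2), t = 1 / (1 + p*|x|),
// with the sign restored afterwards using erf(-x) = -erf(x).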
FP32Vec8 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);
fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
__riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m2_t exp_val =
FP32Vec8(__riscv_vfneg_v_f32m2(
__riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
__riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfneg_v_f32m2_m(mask, res, VEC_ELEM_NUM));
}
};
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg;
explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const float* ptr)
: reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {};
explicit FP32Vec16(const FP32Vec8& data)
: reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {};
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v);
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec16(fixed_vbfloat16m2_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const BF16Vec16& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {};
#endif
FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
}
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_max() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_min() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
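// Sums the group_size lanes in [idx * group_size, (idx + 1) * group_size)
// by masking an unordered reduction with a vid-based index mask.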
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
const int start = idx * group_size;
vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmand_mm_b8(
__riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
__riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
VEC_ELEM_NUM);
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar =
__riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
};
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 min(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 abs() const {
return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
}
FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(
max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m4(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m4(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
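// exp(), tanh() and er() below mirror the FP32Vec8 versions above, widened
// from 8 to 16 lanes (m2 -> m4 register groups).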
FP32Vec16 exp() const {
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m4_t x_scaled =
__riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
fixed_vfloat32m4_t r =
__riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly =
__riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vint32m4_t biased_exp = __riscv_vmax_vx_i32m4(
__riscv_vadd_vx_i32m4(n_int, 127, VEC_ELEM_NUM), 0, VEC_ELEM_NUM);
fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
__riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));
return FP32Vec16(__riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM));
}
FP32Vec16 tanh() const {
fixed_vfloat32m4_t x_clamped = __riscv_vfmin_vf_f32m4(
__riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
FP32Vec16 exp_val =
FP32Vec16(__riscv_vfmul_vf_f32m4(x_clamped, 2.0f, VEC_ELEM_NUM)).exp();
return FP32Vec16(__riscv_vfdiv_vv_f32m4(
__riscv_vfsub_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM),
__riscv_vfadd_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM), VEC_ELEM_NUM));
}
FP32Vec16 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
__riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m4_t exp_val =
FP32Vec16(__riscv_vfneg_v_f32m4(
__riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
__riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec16(__riscv_vfneg_v_f32m4_m(mask, res, VEC_ELEM_NUM));
}
};
// ============================================================================
// Type Traits & Global Helpers
// ============================================================================
template <typename T>
struct VecType {
using vec_type = void;
using vec_t = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
using vec_t = FP32Vec8;
};
template <>
struct VecType<c10::Half> {
using vec_type = FP16Vec8;
using vec_t = FP16Vec8;
};
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
using vec_t = BF16Vec8;
};
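// e.g. vec_t<float> is FP32Vec8, vec_t<c10::Half> is FP16Vec8 and
// vec_t<c10::BFloat16> is BF16Vec8.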
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
*reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
reg = __riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM);
}
inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
reg = __riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM);
}
inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
reg = __riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM);
}
inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc = acc.fma(a, b);
}
#ifdef RISCV_BF16_SUPPORT
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
*ptr = static_cast<__bf16>(v);
};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
#else
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
uint32_t val;
std::memcpy(&val, &v, 4);
*reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
#endif
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }
} // namespace vec_op
#ifndef CPU_KERNEL_GUARD_IN
#define CPU_KERNEL_GUARD_IN(NAME)
#endif
#ifndef CPU_KERNEL_GUARD_OUT
#define CPU_KERNEL_GUARD_OUT(NAME)
#endif
#endif // CPU_TYPES_RISCV_HPP
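
For context, a minimal sketch of how these wrappers compose on the caller side, assuming this header (guarded by CPU_TYPES_RISCV_HPP) is included and the translation unit is built with the matching fixed-VLEN RVV flags; fp32_to_bf16_row is an illustrative helper, not part of the header:

#include "cpu_types_riscv.hpp"  // assumed header name for this file

// Convert a row of fp32 values to bf16, 8 lanes at a time, with a scalar tail.
static void fp32_to_bf16_row(const float* src, c10::BFloat16* dst, int n) {
  int i = 0;
  for (; i + 8 <= n; i += 8) {
    vec_op::FP32Vec8 v(src + i);  // vector load of 8 floats
    vec_op::BF16Vec8 b(v);        // narrow to bf16 (HW or FP32-simulated path)
    b.save(dst + i);              // contiguous 16-bit store
  }
  for (; i < n; ++i) vec_op::storeFP32(src[i], dst + i);  // scalar remainder
}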

View File

@@ -237,10 +237,13 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
};
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
#ifdef __aarch64__
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoids runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
#else
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
#endif
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
@@ -408,19 +411,21 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoids runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
// selector can choose an optimally blocked
// weight layout.
.a_m_size = kProbeM,
.a_m_stride = b_k_size_,
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
MSizeCacheKey{
#ifdef VLLM_USE_ACL
// Arm Compute Library (ACL) backend for oneDNN does
// not support runtime
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true)
.weights_desc());
init_runtime_memory_cache(args);
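
Both hunks above replace a runtime M with a fixed probe M when building the primitive descriptor used for weight prepacking. A minimal sketch of that prepacking idea with oneDNN v3's C++ API follows; shapes, the f32 data type, and the names prepack_b/kProbeM are illustrative assumptions, not vLLM's handler code:

#include <oneapi/dnnl/dnnl.hpp>
using namespace dnnl;

memory prepack_b(engine& eng, const float* b_ptr, memory::dim K, memory::dim N) {
  // Plain row-major weights as supplied by the caller.
  memory::desc plain_b({K, N}, memory::data_type::f32, memory::format_tag::ab);
  // Probe descriptor: a concrete M (instead of DNNL_RUNTIME_DIM_VAL) lets the
  // kernel selector pick an optimally blocked weight layout via format_tag::any.
  constexpr memory::dim kProbeM = 128;
  memory::desc a_md({kProbeM, K}, memory::data_type::f32, memory::format_tag::ab);
  memory::desc b_md({K, N}, memory::data_type::f32, memory::format_tag::any);
  memory::desc c_md({kProbeM, N}, memory::data_type::f32, memory::format_tag::ab);
  matmul::primitive_desc pd(eng, a_md, b_md, c_md);
  // Reorder the user weights into the chosen blocked layout once, up front,
  // instead of paying for a reorder on every matmul call.
  memory user_b(plain_b, eng, const_cast<float*>(b_ptr));
  memory packed_b(pd.weights_desc(), eng);
  stream s(eng);
  reorder(user_b, packed_b).execute(s, user_b, packed_b);
  s.wait();
  return packed_b;
}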

View File

@@ -4,7 +4,7 @@
#include <torch/library.h>
// Note: overwrite the external definition for sharing same name between
// Note: overwrite the external defination for sharing same name between
// libraries use different ISAs.
#define TORCH_EXTENSION_NAME _C

View File

@@ -196,7 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
return val;
#else
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
return u32x8_t{};
return {};
#endif
}
@@ -211,51 +211,23 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
#endif
}
// 32-bit load / store.
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
// 32-bit cache-streaming (.cs) load / store.
// Falls back to ld32/st32 on ROCm (no .cs hint).
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
__forceinline__ __device__ int ld32_cs(const int* addr) {
#if VLLM_256B_PTX_ENABLED
int val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
#else
val = ld32(addr);
#endif
return val;
#else
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
return 0;
#endif
}
__forceinline__ __device__ void st32_cs(int* addr, int val) {
#ifndef USE_ROCM
#if VLLM_256B_PTX_ENABLED
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
#else
st32(addr, val);
#endif
}
// 128-bit cache-streaming (.cs) load / store.
// Falls back to ld128/st128 on ROCm (no .cs hint).
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
int4 val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(addr));
#else
ld128(val, addr);
#endif
return val;
}
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
#ifndef USE_ROCM
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
#else
st128(val, addr);
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
#endif
}
@@ -288,7 +260,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
bool pred) {
#ifndef USE_ROCM
#if VLLM_256B_PTX_ENABLED
uint32_t r0, r1, r2, r3;
asm volatile(
@@ -306,7 +278,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
val = uint4{r0, r1, r2, r3};
#else
assert(false && "ld128_cg_or_zero is not supported on ROCm");
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
#endif
}

View File

@@ -109,18 +109,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
#ifndef USE_ROCM
int flag = 0;
CUresult rdma_result = cuDeviceGetAttribute(
CUDA_CHECK(cuDeviceGetAttribute(
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
device);
if (rdma_result == CUDA_SUCCESS &&
flag) { // support GPUDirect RDMA if possible
device));
if (flag) { // support GPUDirect RDMA if possible
prop.allocFlags.gpuDirectRDMACapable = 1;
}
int fab_flag = 0;
CUresult fab_result = cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
if (fab_result == CUDA_SUCCESS &&
fab_flag) { // support fabric handle if possible
CUDA_CHECK(cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
if (fab_flag) { // support fabric handle if possible
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
}
#endif

View File

@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
// Intialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
// Intialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}

View File

@@ -73,9 +73,10 @@ void moe_permute(
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, stream);
});
}

View File

@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,

View File

@@ -2,7 +2,7 @@
template <typename T, bool CHECK_SKIPPED>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -16,6 +16,7 @@ __global__ void expandInputRowsKernel(
int64_t expanded_dest_row = blockIdx.x;
int64_t const expanded_source_row =
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
if (threadIdx.x == 0) {
assert(expanded_dest_row <= INT32_MAX);
@@ -53,7 +54,7 @@ __global__ void expandInputRowsKernel(
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output,
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -69,12 +70,12 @@ void expandInputRowsKernelLauncher(
bool is_check_skip = num_valid_tokens_ptr != nullptr;
auto func = func_map[is_check_skip];
func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row,
permuted_idx, expert_first_token_offset,
num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
func<<<blocks, threads, 0, stream>>>(
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
}
template <class T, class U>

View File

@@ -295,14 +295,10 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_scale);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
if (!lane_id) {
// Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8M0 format. Remove Mantissa.
// Packed UE8MO format. Remove Mantissa.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0;

View File

@@ -16,8 +16,6 @@
#include <torch/all.h>
#include "nvfp4_utils.cuh"
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@@ -53,10 +51,9 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
torch::Tensor const& output_scale_offset_by_experts);
#endif
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_sf,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_sf) {
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_sf, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
@@ -65,34 +62,6 @@ void scaled_fp4_quant_out(torch::Tensor const& input,
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
int64_t n = input.size(-1);
int64_t m = input.numel() / n;
auto device = input.device();
// Two fp4 values packed into a uint8
auto output = torch::empty(
{m, n / 2}, torch::TensorOptions().device(device).dtype(torch::kUInt8));
torch::Tensor output_sf;
if (is_sf_swizzled_layout) {
auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
output_sf = torch::empty(
{sf_m, sf_n},
torch::TensorOptions().device(device).dtype(torch::kInt32));
} else {
output_sf = torch::empty(
{m, n / CVT_FP4_SF_VEC_SIZE},
torch::TensorOptions().device(device).dtype(torch::kUInt8));
}
scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
output_sf);
return {output, output_sf};
}
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,

View File

@@ -18,7 +18,6 @@
#include <cuda_runtime.h>
#include <cuda_fp8.h>
#include <utility>
#include "../../cuda_vec_utils.cuh"
@@ -55,18 +54,6 @@ inline int computeEffectiveRows(int m) {
return round_up(m, ROW_TILE);
}
// Compute the shape of the swizzled SF output tensor.
// Returns (rounded_m, rounded_n / 4) where:
// rounded_m = round_up(m, 128)
// rounded_n = round_up(n / CVT_FP4_SF_VEC_SIZE, 4)
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
int64_t n) {
int64_t rounded_m = round_up(m, static_cast<int64_t>(128));
int64_t scale_n = n / CVT_FP4_SF_VEC_SIZE;
int64_t rounded_n = round_up(scale_n, static_cast<int64_t>(4));
return {rounded_m, rounded_n / 4};
}
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
uint32_t val;

View File

@@ -15,33 +15,31 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
scalar_t* __restrict__ residual = nullptr) {
float rms = 0.0f;
float token_scale = 0.0f;
// Compute rms
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
&rms, input, hidden_size, var_epsilon, residual);
// Compute scale
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
input_stride, residual);
residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
} else {
// FP8 - Do not invert token_scale for exact match with FBGemm
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
}
}
@@ -53,40 +51,38 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
scalar_t* __restrict__ residual = nullptr) {
// For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively.
bool const can_vectorize = hidden_size % 4 == 0 and input_stride % 4 == 0;
bool const can_vectorize = hidden_size % 4 == 0;
if (can_vectorize) {
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
has_residual>(
out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
input_stride, residual);
residual);
}
float rms = 0.0f;
float token_scale = 0.0f;
// Compute RMS
vllm::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
vllm::compute_rms<scalar_t, has_residual>(&rms, input, hidden_size,
var_epsilon, residual);
// Compute Scale
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
input_stride, residual);
residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
out, input, weight, rms, &token_scale, hidden_size, residual);
} else {
// FP8 - Do not invert s_token_scale for exact match with FBGemm
vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
out, input, weight, rms, &token_scale, hidden_size, residual);
}
}
@@ -101,20 +97,19 @@ __global__ void rms_norm_per_block_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr,
int64_t outer_scale_stride = 1) {
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
float rms;
// Compute RMS
// Always able to vectorize due to constraints on hidden_size
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
&rms, input, hidden_size, var_epsilon, residual);
// Compute Scale
// Always able to vectorize due to constraints on hidden_size and group_size
vllm::vectorized::compute_dynamic_per_token_scales<
scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
nullptr, scales, input, weight, rms, scale_ub, hidden_size, input_stride,
residual, outer_scale_stride);
nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual,
outer_scale_stride);
// RMS Norm + Quant
// Always able to vectorize due to constraints on hidden_size
@@ -125,7 +120,7 @@ __global__ void rms_norm_per_block_quant_kernel(
vllm::vectorized::norm_and_quant<
scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
has_residual, is_scale_transposed, group_size>(
out, input, weight, rms, scales, hidden_size, input_stride, residual,
out, input, weight, rms, scales, hidden_size, residual,
outer_scale_stride);
}
@@ -142,7 +137,6 @@ void rms_norm_dynamic_per_token_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -159,7 +153,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size, input_stride,
var_epsilon, hidden_size,
has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
});
});
@@ -176,9 +170,7 @@ void rms_norm_dynamic_per_token_quant(
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -187,7 +179,6 @@ void rms_norm_dynamic_per_token_quant(
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
VLLM_DISPATCH_FLOATING_TYPES(
@@ -209,15 +200,6 @@ void rms_norm_per_block_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual, bool is_scale_transposed) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
TORCH_CHECK(hidden_size % 4 == 0,
"Hidden size must be divisible by 4 for vectorized access");
TORCH_CHECK(input_stride % 4 == 0,
"Input stride must be divisible by 4 for vectorized access");
TORCH_CHECK(group_size % 4 == 0,
"Group size must be divisible by 4 for vectorized access");
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -243,7 +225,7 @@ void rms_norm_per_block_quant_dispatch(
weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>()
: nullptr,
var_epsilon, hidden_size, input_stride,
var_epsilon, hidden_size,
has_residual ? residual->data_ptr<scalar_in_t>()
: nullptr,
scales.stride(1));
@@ -264,9 +246,7 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -275,7 +255,6 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
TORCH_CHECK(group_size == 128 || group_size == 64,

Some files were not shown because too many files have changed in this diff.