Compare commits

...

229 Commits

Author SHA1 Message Date
khluu
262ddd0d81 [cherry-pick][Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy #37322
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-18 01:48:32 -07:00
Li, Jiang
e60c1674b3 [Bugfix] Avoid OpenMP thread reallocation in CPU torch compile (#37391)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
(cherry picked from commit 261801242f)
2026-03-18 01:41:42 -07:00
Roy Wang
faa80947f5 [Performance] Add --enable-ep-weight-filter CLI option (#37351)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
2026-03-18 01:41:25 -07:00
Terry Gao
eeabf740bb [Custom Ops] Add functional + out variant for scaled_fp4_quant (#34389)
Signed-off-by: tianrengao <terrygao87@gmail.com>
(cherry picked from commit 3e6a1e1686)
2026-03-18 01:41:09 -07:00
Elvir Crnčević
cdcffafef8 Fix eplb nvfp4 experts hook (#37217)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Signed-off-by: Elvir Crncevic <elvir@anthropic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
(cherry picked from commit fd4d96302a)
2026-03-18 01:40:57 -07:00
Walter Beller-Morales
4d22667c32 [Feature][Frontend] add support for Cohere Embed v2 API (#37074)
Signed-off-by: walterbm <walter.beller.morales@gmail.com>
(cherry picked from commit 061980c36a)
2026-03-16 22:05:47 -07:00
Andreas Karatzas
1fe3932c8b [ROCm] Fix AttributeError for torch.compiler.skip_all_guards_unsafe on older PyTorch (#37219)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 54a62a79f7)
2026-03-16 21:03:49 -07:00
zhanqiuhu
2dccb38f73 [Bugfix][MultiConnector] Fix MultiConnector for SupportsHMA sub-connectors (#36549) 2026-03-16 20:51:04 +00:00
Kunshang Ji
d157216093 [BUGFIX][Mamba] Use uint64 for address in KVBlockZeroer (#37197)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-16 21:39:56 +01:00
Matthew Bonanni
93f3c8e531 [Misc] Add float16 to CacheDType (#37199)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-16 13:24:48 -07:00
rasmith
2cc26c3a99 [CI][BugFix][MORI][AMD] Add transfer_id to kv transfer params for test (#37213)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-03-16 13:22:57 -07:00
Flora Feng
dfa8852db2 [Refactor] Consolidate GPT-OSS reasoning parser tests (#36915)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-16 15:53:07 -04:00
Lucas Kabela
714c6e0eab [torch.compile][BE] Modify cudagraph callable to check for is_forward_context_set (#36288)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2026-03-16 19:42:34 +00:00
Sage
0fefd00e6c [Bugfix] Fix render server crash for quantized models on CPU-only hosts (#37215)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-16 18:59:01 +00:00
Nicolò Lucchesi
f5c081d432 [PD][Nixl] Add support for hybrid SSM-FA models (#36687) 2026-03-16 19:58:06 +01:00
Matthew Bonanni
c88ea8338b [MTP][Sparse MLA] Take advantage of native MTP support in indexer when possible (#36982)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-16 13:51:21 -04:00
Max de Bayser
9f9ecff4cd Add simple granite4 tool parser (#36827)
Signed-off-by: Max de Bayser <maxdebayser@gmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2026-03-16 10:49:09 -07:00
haosdent
ca1954d58c [Bugfix] Disable cross-layer KV cache for MLA attention backends (#37090)
Signed-off-by: haosdent <haosdent@gmail.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
2026-03-16 19:03:10 +02:00
Raushan Turganbay
55e6d3d5c0 [Bugfix] Make siglip/clip compatible with transformers v5 (#37200)
Signed-off-by: raushan <raushan@huggingface.co>
2026-03-16 16:48:18 +00:00
Chauncey
6682c231fa [Bugfix] Add error handling for FINISHED_ERROR in OpenAIServing (#37148)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-03-16 16:27:47 +00:00
Itay Etelis
5ae685c1c8 [Bugfix] Relax TRTLLM KV cache contiguity assertion for cross-layer layout (#34158)
Signed-off-by: Itay Etelis <itay.etelis@ibm.com>
Co-authored-by: Itay Etelis <itay.etelis@ibm.com>
2026-03-16 11:20:51 -04:00
Wentao Ye
ce8cf9161d [Compile] Fix compile warning st256_cs in cuda_vec_utils.cuh (#36693)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-16 11:12:15 -04:00
xjx
18be11fd59 [BUGFIX] fix CUDA OOM ERROR: invalid argument at cumem_allocator.cpp:119 (#35594)
Signed-off-by: xjx <493337577@qq.com>
2026-03-16 15:10:42 +00:00
Yuanheng Zhao
8d8855fdae [Bugfix] Add safety check and fallback for null scaling factor (#36106)
Signed-off-by: Yuanheng Zhao <jonathan.zhaoyh@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 14:27:29 +00:00
Wentao Ye
e855d380fa [Compile] Fix compile warning in moe_permute (#36529)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-16 10:16:14 -04:00
Benjamin Bartels
0e5a9382af [Bugfix] accept redacted thinking blocks in Anthropic messages (#36992)
Signed-off-by: Benjamin Bartels <benjaminba@tiglab-ubuntu.ilab.local>
Signed-off-by: bbartels <benjamin@bartels.dev>
Co-authored-by: Benjamin Bartels <benjaminba@tiglab-ubuntu.ilab.local>
2026-03-16 22:01:57 +08:00
Fynn Schmitt-Ulms
04bf5a35fa [Spec Decode] Update extract_hidden_states to use deferred kv_connector clear (#37013) 2026-03-16 14:53:45 +01:00
Tianyu Guo
43a73f853b Remove unused EVS functions in qwen3_vl.py (#37183)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
2026-03-16 13:09:09 +00:00
Julien Denize
ffbc2e5bdb Patch Mistral config (#37104)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
2026-03-16 12:22:18 +00:00
Lukas Geiger
f9e6db3034 [Models][Qwen3 ViT] Keep max_seqlen on CPU to prevent D2H sync (#37139)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-16 12:11:59 +00:00
elvischenv
d61d2b08e9 [Build] Fix API rate limit exceeded when using VLLM_USE_PRECOMPILED=1 (#36229)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 12:09:27 +00:00
Artem Perevedentsev
f5e59ee7a6 [Performance] Add prefetch for checkpoints to OS page cache (#36012)
Signed-off-by: Artem Perevedentsev <aperevedents@nvidia.com>
2026-03-16 11:32:02 +00:00
Harry Mellor
9b005edc48 [Docs] Make the link to hardware plugins clearer (#37174)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 04:12:58 -07:00
Robin Nabel
bf9a185395 GLM4 tool parser: fix streaming mode (#35208)
Signed-off-by: Robin Nabel <opensource@nabel.co>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2026-03-16 18:48:52 +08:00
Harry Mellor
ad041c79db Fix text only inputs for MRoPE models with the Transformers modelling backend (#37055)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 10:31:16 +00:00
Kunshang Ji
747b068136 [Hardware] Replace memory related torch.cuda APIs (#37031)
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
2026-03-16 10:24:48 +00:00
Harry Mellor
122f75d939 Fix pipeline parallel with multimodal models with the Transformers modelling backend (#37057)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 10:20:37 +00:00
SoluMilken
d8f8a7aad2 [Misc] Sync pre-commit to 4.5.1 in workflows and docs (#36675)
Signed-off-by: SoluMilken <ypiheyn.imm02g@g2.nctu.edu.tw>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-16 10:03:21 +00:00
Roy Wang
0115e957d4 [Frontend][Misc] Remove unused log in /is_sleeping (#37093)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
2026-03-16 17:46:28 +08:00
haosdent
116ed130f4 [Bugfix] Fix GDN attention crash with mixed decode/spec-decode batches (#34871)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-03-16 10:30:23 +01:00
Vadim Gimpelson
8374387bd8 [FlashInfer] Revert block_size 16 + head_size 256 workaround on Blackwell (#36987)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-03-16 09:04:29 +00:00
Isotr0py
912fbe9555 [Bugfix] Fix Qwen2.5-Omni/Qwen3-Omni use_audio_in_video with multi-video inputs (#37147)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-16 08:56:06 +00:00
Laith Sakka
52131f88d9 use skip_all_guards_unsafe to drop global_state and torch_function_mode_stack guards instead of previous hacks (#36204)
Signed-off-by: Laith Sakka <lsakka@meta.com>
2026-03-16 08:52:31 +00:00
Roy Wang
821eb80c0d [Performance][Model Loader] Skip non-local expert weights during EP model loading (#37136)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
2026-03-16 01:33:36 -07:00
Andreas Karatzas
a2956a0f8e [ROCm][CI] Retrying in case of batch variance effects and reducing flakiness (#36442)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-16 16:08:51 +08:00
Andreas Karatzas
911355e216 [ROCm] Fix KV copy methods and auto-select attention backend for ROCm (#36845)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-16 16:07:27 +08:00
Chauncey
8d3f8f485e [Bugfix] fix Qwen3.5 tool calling bug (#36774)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-03-16 15:38:42 +08:00
Woosuk Kwon
96efb91480 [Model Runner V2] Fix processed logits in sample() (#37144)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-16 00:35:49 -07:00
leo-cf-tian
2754231ba3 [Kernel] Add FlashInfer MoE A2A Kernel (#36022)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Signed-off-by: Leo Tian <lctian@nvidia.com>
Co-authored-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: root <root@lyris0267.lyris.clusters.nvidia.com>
2026-03-15 23:45:32 -07:00
bigshanedogg
2390d44209 [Model] Add HyperCLOVAX-SEED-Think-14B language model support (#37107)
Signed-off-by: bigshanedogg <bigshane319@gmail.com>
2026-03-16 06:40:05 +00:00
Li, Jiang
7362b4450a [Bugfix] Avoid LD_PRELOAD check on MacOS (#37145)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-03-15 23:31:44 -07:00
Andreas Karatzas
57a314d155 [CI][Bugfix] Fix 500 errors from priority overflow and TemplateError subclasses in schema fuzz tests (#37127)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-16 05:27:21 +00:00
Andreas Karatzas
d4c57863f7 [ROCm][CI] Fix engine teardown and text normalization to stabilize voxtral test (#37138)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-16 04:49:31 +00:00
Wang, Yiting
68e1b711f1 [XPU] Add deepseek_scaling_rope fused kernel (#36612)
Signed-off-by: yitingw1 <yiting.wang@intel.com>
2026-03-16 12:35:08 +08:00
rasmith
0024f39a32 [ROCm][P/D][MORI][BugFix] Add transfer_id to moriio_connector to restore P/D functionality (#34907)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-03-16 10:36:51 +08:00
Andrew Xia
e9163b536e [responsesAPI][ez] add a unit test for SimpleContext logprobs (#37126)
Signed-off-by: Andrew Xia <axia@meta.com>
2026-03-15 17:12:26 -07:00
Lalithnarayan C
7acaea634c In-Tree AMD Zen CPU Backend via zentorch [1/N] (#35970)
Signed-off-by: Lalithnarayan C <Lalithnarayan.C@amd.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Chinmay-Kulkarni-AMD <Chinmay.Kulkarni@amd.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 23:35:35 +00:00
Jiangyun Zhu
697e4ff352 [GDN] add a config for gdn kernel selection (#36647)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-03-16 00:40:17 +08:00
Hari
a3e2e250f0 [Feature] Add Azure Blob Storage support for RunAI Model Streamer (#34614)
Signed-off-by: hasethuraman <hsethuraman@microsoft.com>
2026-03-15 19:38:21 +08:00
Isotr0py
143e4dccdf [Misc] Add online audio_in_video test (#36775)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-15 00:14:11 -07:00
Isotr0py
6590a3ecda [Frontend] Remove torchcodec from audio dependency (#37061)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-15 05:15:59 +00:00
Russell Bryant
b3debb7e77 [Build] Upgrade xgrammar to get a security fix (#36168)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-15 03:13:48 +00:00
Nick Hill
458c1a4b2d [Frontend] Reduce chat template warmup logging levels (#37062)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-14 13:48:59 -07:00
Karan Bansal
821fde2df4 [Bugfix] Fix xgrammar dtype mismatch on macOS CPU inference (#32384)
Signed-off-by: Karan Bansal <karanb192@gmail.com>
Co-authored-by: Inokinoki <inoki@inoki.cc>
2026-03-14 17:29:06 +00:00
arlo
8c29042bb9 [Feature] Add InstantTensor weight loader (#36139) 2026-03-14 18:05:23 +01:00
Cyrus Leung
5467d137b3 [Frontend] Avoid startup error log for models without chat template (#37040)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-14 09:36:11 -07:00
Santino Ramos
3ed46f374b [Model Runner V2] Add Support for XD-RoPE (#36817)
Signed-off-by: Santino Ramos <elsantinoramos@gmail.com>
2026-03-14 09:27:55 -07:00
seanmamasde
84868e4793 [Bugfix][Frontend] Fix audio transcription for MP4, M4A, and WebM formats (#35109)
Signed-off-by: seanmamasde <seanmamasde@gmail.com>
2026-03-14 08:44:03 -07:00
Isotr0py
a8e8d62dd8 [Misc] Clean up Kimi-audio whisper encoder loading (#36903)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-14 23:37:52 +08:00
Julien Denize
e42b49bd69 Mistral common v10 (#36971)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Co-authored-by: root <root@h200-bar-196-227.slurm-bar-compute.tenant-slurm.svc.cluster.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-03-14 07:26:43 -07:00
Sergey Zinchenko
4a718e770d [Bug] Fix Failure in /v1/chat/completions/render for Multimodal Requests (https://github.com/vllm-project/vllm/issues/35665) (#35684) 2026-03-14 14:10:11 +00:00
Kevin H. Luu
600a039f57 [CI] Shard Multi-Modal Models (Standard) into 4 parallel jobs (#37014)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 08:26:54 +00:00
Harry Mellor
ffa5d74f15 Enable loading of fused expert weights in the Transformers modelling backend (#36997)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-14 07:01:06 +00:00
Kevin H. Luu
74fe80ee95 [CI] Split Distributed Tests (4 GPUs) into 3 parallel jobs (#37015)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 12:21:13 +08:00
Flora Feng
bcfdadb1bc [Refactor] Relocate chat completion and anthropic tests (#36919)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-03-14 12:16:16 +08:00
Yanan Cao
236de72e49 [CI] Pin helion version (#37012)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 23:25:29 -04:00
sbeurnier
a116f96930 [V1] Remove pin_memory() in async_copy_to_gpu to fix sporadic stalls (#37006)
Signed-off-by: Sebastien Beurnier <sbeurnier@together.ai>
2026-03-14 01:37:32 +00:00
Li, Jiang
092ace9e3a [UX] Improve UX of CPU backend (#36968)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-14 09:27:29 +08:00
Andrew Xia
f680dc1b39 [responsesAPI] prioritize content over summary in reasoning item input (#36516)
Signed-off-by: Andrew Xia <axia@meta.com>
Signed-off-by: Andrew Xia <mitandrewxia@gmail.com>
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2026-03-14 09:20:30 +08:00
Giulio Leone
b41aa264f9 fix: resolve chat template names before kwargs detection (#36937)
Co-authored-by: giulio-leone <giulio.leone@users.noreply.github.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-14 00:20:16 +00:00
Dimitrios Bariamis
367cf5cd3e [Feat][Bugfix] Enable additional dimension for Flashinfer MLA and fix routing dtype (#36931)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
2026-03-13 16:41:16 -07:00
haosdent
6d53efd2a5 [Bugfix] Fix MLA attention crash with AWQ/GPTQ quantized models (#34695)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-03-13 23:25:41 +00:00
Benjamin Chislett
8b346309a5 [Refactor] Consolidate SupportsEagle (#36063)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-03-13 23:22:40 +00:00
Nick Hill
54a6db827f [BugFix] Fix "DP Coordinator receives unexpected..." messages (#37008)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-13 23:18:05 +00:00
Matthew Bonanni
9efc4db965 [Bugfix] Fix DeepSeek-V3.2 tokenizer stripping spaces (#37004)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-13 22:55:36 +00:00
Kevin H. Luu
f1816fb192 [CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 14:16:02 -07:00
Harry Mellor
0005d2a3c9 Use Transformers v5 WeightRenaming for Transformers modeling backend (#31545)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-13 20:49:08 +00:00
Ekagra Ranjan
d0b402974f [Bugfix][Spec Decode] Avoid double call of Ngram CPU (#36952)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2026-03-13 20:33:19 +00:00
Divakar Verma
6341d43043 [ROCm][Quantization] add quark w4a8 mxfp4_fp8 for LinearLayer (#35316)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2026-03-13 19:44:24 +00:00
Mark McLoughlin
7afe0faab1 [Frontend][Core] Re-add shutdown timeout - allowing in-flight requests to finish (#36666)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-03-13 12:10:06 -07:00
Harry Mellor
5a3f1eb62f [Misc] Set default kv_buffer_device in a better way (#36862)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-13 19:07:33 +00:00
yugong333
b3ce711b93 Fp8 lora dense kernel (#35242)
Signed-off-by: Yu Gong <yu3.gong@gmail.com>
2026-03-13 19:05:08 +00:00
Isotr0py
abf61aaa8e [Bugfix] Fix Qwen2.5-omni/Qwen3-omni mm_processor cache for audio_in_video request (#36800)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-13 18:16:05 +00:00
bigmoyan
4508532fbd [Bugfix] fix paddleocr crash on some image shape (#36959)
Signed-off-by: wangzhengtao <wangzhengtao@msh.team>
Signed-off-by: bigmoyan <moyan_work@foxmail.com>
Co-authored-by: wangzhengtao <wangzhengtao@msh.team>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-13 13:46:55 +00:00
Itay Alroy
d5af196c18 [2/N] Elastic EP Milestone 2: Integrating NIXL-EP (#35627)
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Co-authored-by: Yongji Wu <wuyongji317@gmail.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
2026-03-13 09:25:33 -04:00
Chaojun Zhang
82f836d976 [XPU] Support LoRA via torch.compile on XPU platform (#36962)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
2026-03-13 10:34:59 +00:00
Andreas Karatzas
4fccd30f19 [ROCm][CI] Upgrading orchestrator to handle python pipeline markers and options (#36181)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-13 02:04:22 -07:00
Or Ozeri
cfaf4668f7 [kv_offload+HMA][1/N]: Support multiple KV groups in OffloadingSpec (#36610)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2026-03-13 08:04:21 +00:00
Andreas Karatzas
99a57bdf74 [ROCm][CI] Corrected the GPT-OSS test root path (#36711)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-13 15:53:43 +08:00
Sage
a2268617cf [Frontend] Delegate preprocessing to OpenAIServingRender (#36483)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-13 00:39:43 -07:00
Rohan Potdar
a4ad9db541 Enable RoPE+KV cache fusion for ROCm AITER FA (non-shuffle layout) (#35786)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-03-13 07:33:22 +00:00
Nick Hill
b373b5102a [Tests] Shutdown test RemoteVLLMServer cleanly (#36950)
Recent PR #33949 changed the teardown logic of the RemoteVLLMServer test utility class to
send SIGTERM to all vllm (sub)processes at once, which breaks the clean/coordinated
shutdown logic that assumes only the top-level process will receive a signal (for example
when running in a container that's shut down).

This caused a number of errors and stack traces in some test logs, even though those tests
still pass. We should still attempt a normal shutdown first, and only kill the other
processes if they are still running after a few seconds.

Example: tests/v1/distributed/test_external_lb_dp.py::test_external_lb_completion_streaming

Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-13 07:32:55 +00:00
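The fix described in the commit message above follows a common graceful-then-forceful teardown pattern. Below is a minimal sketch of that pattern, assuming a hypothetical `proc` handle from `subprocess.Popen`; the actual `RemoteVLLMServer` teardown differs in detail.

```python
import signal
import subprocess

def shutdown_cleanly(proc: subprocess.Popen, grace_seconds: float = 5.0) -> None:
    # Signal only the top-level process so it can coordinate child shutdown itself.
    proc.send_signal(signal.SIGTERM)
    try:
        # Give the process tree a few seconds to exit on its own.
        proc.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        # Escalate only if the process is still alive after the grace period.
        proc.kill()
        proc.wait()
```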
Thomas Parnell
f296a1966d [Bugfix] Fix FlashInfer GDN warmup ValueError on SM90 GPUs (#36876) 2026-03-13 07:09:39 +01:00
Csrayz
bc2c0c86ef [Frontend] Fix usage incorrectly returned with empty stream_options (#36379)
Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com>
2026-03-13 03:33:04 +00:00
jaime campos salas
891c60dcd5 fix(kv-cache): increase hybrid attention grouping threshold from 1.25 to 1.5 (#36684)
Signed-off-by: Jaime Campos Salas <jaime.campos.salas@gmail.com>
2026-03-12 23:28:27 -04:00
whyiug
1ce13cf992 [Model] Add support for BERT-like Chinese ERNIE pooling models (#36385)
Signed-off-by: whyiug <whyiug@hotmail.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-03-13 03:23:53 +00:00
Nikita
10f08dedfa [Model] Add ColPali late interaction model for multi-modal retrieval (#36818)
Signed-off-by: Nikita Sukharev <kaonael@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-03-13 02:18:57 +00:00
Aaron Hao
5e1a373d2e [BUG] Fix rank calculation in NCCLWeightTransferEngine (#36940)
Signed-off-by: hao-aaron <ahao@anyscale.com>
2026-03-13 01:56:51 +00:00
Simo Lin
572c776bfb build: update smg-grpc-servicer to use vllm extra (#36938)
Signed-off-by: Simo Lin <linsimo.mark@gmail.com>
2026-03-13 01:31:36 +00:00
Yifan Qiao
55d8073d06 [Bugfix] ep_scatter kernel store-load race condition (#34991)
Signed-off-by: Yifan Qiao <yifanqiao@berkeley.edu>
2026-03-13 01:07:59 +00:00
Nick Hill
cd32d6f586 [Model Runner V2] Some code simplification (#36929)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-13 00:59:23 +00:00
Jaewon
aaa3092f51 [MoE] Add routing simulation override for MXFP4 quantized MoE (#33595)
Signed-off-by: Jaewon Lee <jaewon@meta.com>
2026-03-13 00:30:44 +00:00
Shubhra Pandit
87985077a4 [Speculative Decoding] Add norm_before_fc for gpt-oss draft models (#36545)
Signed-off-by: Shubhra Pandit <shubhra.pandit@gmail.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
2026-03-12 23:03:32 +00:00
Ryan Rock
a79c1c2c80 [AMD][Build] Add DeepEP to ROCm Dockerfile (#36086)
Signed-off-by: Ryan Rock <ryan.rock@amd.com>
2026-03-12 21:33:32 +00:00
Andreas Karatzas
cc8f1f4764 [ROCm][CI] Preparing gfx90a mirroring (#36210)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-12 13:42:25 -07:00
Michael Goin
05b9e8ab5b Revise environment setup in AGENTS.md (#36909)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-12 19:21:11 +00:00
Xinan Miao
2cdf92228c [Feature]: Remove Chunking From FusedMoE (#34086)
Signed-off-by: SouthWest7 <am1ao@qq.com>
Signed-off-by: Southwest <1403572259@qq.com>
Signed-off-by: southwest <am1ao@qq.com>
Signed-off-by: Xinan Miao <1403572259@qq.com>
Co-authored-by: SouthWest7 <am1ao@qq.com>
2026-03-12 14:24:38 -04:00
Marc Sun
c973ecdead [bnb] Skip moe + bnb test (#36896)
Signed-off-by: Marc Sun <marc@huggingface.co>
2026-03-12 18:03:25 +00:00
Harry Mellor
e39257a552 Add AGENTS.md (#36877)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-12 10:20:50 -07:00
Dimitrios Bariamis
cc16b24b17 Update Flashinfer to 0.6.6 (#36768)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
2026-03-12 13:19:19 -04:00
Eunkwang Jeon
bdc2343454 [Bugfix] Fix KeyError in parse_response_input for reasoning items with optional content (#34499)
Signed-off-by: jeonsworld <jeonsworld@gmail.com>
2026-03-13 00:13:36 +08:00
Matthew Bonanni
f444c05c32 [Attention] Use FA4 for MLA prefill (#34732)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-12 12:10:17 -04:00
SoluMilken
85199f9681 [Bugfix] fix main branch pre-commit error (1 line change) (#36897)
Signed-off-by: SoluMilken <ypiheyn.imm02g@g2.nctu.edu.tw>
2026-03-12 09:08:37 -07:00
grimulkan
a1257fd1ea [Kernel] Add FP8 KV cache support to Triton MLA decode attention (#34597)
Signed-off-by: grimulkan <grimulkan@gmail.com>
2026-03-12 08:32:34 -07:00
Thomas Parnell
abcffbba8c [CI] Fix mypy pre-commit errors on main (#36882)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 08:22:29 -07:00
Kunshang Ji
53ec16a705 [Hardware] Replace torch.cuda.device_count/current_device/set_device API (#36145)
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-12 07:57:47 -07:00
Wei Zhao
2e693f48e7 [Perf] Add TRTLLM FP8 MoE Modular Kernel (#36307)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-03-12 07:32:31 -07:00
Martin Hickey
7f1f36bf91 [CI] Fix mypy for vllm/reasoning (#35742)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-12 12:21:33 +00:00
Mark McLoughlin
5282c7d4d0 [docs] Add lightweight AI assisted contribution policy (#30947)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-03-12 11:46:13 +00:00
caozuoba
9e19f8338b [Perf] add packed recurrent fast path for decode (#36596)
Signed-off-by: hdj <1293066020@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-03-12 04:01:57 -07:00
Sage
06e0bc21d2 [Frontend] Split OpenAIServingModels into OpenAIModelRegistry + OpenAIServingModels (#36536)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-12 03:29:37 -07:00
Chauncey
5a71cdd76e [Bugfix] Fix crash when tool_choice=required exceeds max_tokens (#36841)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-03-12 03:28:45 -07:00
Shanshan Shen
f0d3658c0f [MM][OOT] Support CPU seq_lens for OOT MMEncoderAttention kernels (#36605)
Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-12 03:28:23 -07:00
Michael Goin
57431d8231 [UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)
Co-authored-by: Claude <noreply@anthropic.com>
2026-03-12 05:19:35 -04:00
Xu Jinyang
3e64fe4a18 [Bugfix] Warm up Triton autotuner for GDN layers during V1 profiling (#36599)
Signed-off-by: AuYang <459461160@qq.com>
2026-03-12 00:51:09 -07:00
sfeiqiang
8cb24d3aed [KV Connector] Support using FlexKV as KV Cache Offloading option. (#34328)
Signed-off-by: phaedonsun <phaedonsun@tencent.com>
Co-authored-by: phaedonsun <phaedonsun@tencent.com>
2026-03-12 00:46:20 -07:00
István Ketykó
00726c74c9 [Bugfix][Model] Fix DeepSeek-OCR TensorSchema crash on empty images_crop (#36670)
Signed-off-by: István Ketykó <istvan.ketyko@gmail.com>
2026-03-12 15:35:54 +08:00
Chauncey
9fe404ed04 [Frontend] OpenAI Responses API supports Tool/Function calling with streaming (#29947)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-03-12 15:03:50 +08:00
Sage
802f306cd1 [Tests] Skip model weight download for render-only test server (#36813)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-12 06:24:42 +00:00
Yan Ma
894843eb25 replace "with torch.cuda.device" with "with torch.accelerator.device_index" (#36144)
Signed-off-by: Yan Ma <yan.ma@intel.com>
2026-03-11 23:12:57 -07:00
Yanan Cao
584a3f56de [Kernel][Helion][13/N] Force static_shapes=False in helion register (#36677)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 05:35:29 +00:00
Nick Hill
36735fd772 [BugFix] Fix multiple/duplicate stdout prefixes (#36822)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-12 12:23:21 +08:00
wang.yuqi
6ecabe4936 [CI Failure] Fix Language Models Test (Extended Pooling) daily CI Failure (#36761)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-03-12 12:22:05 +08:00
Woosuk Kwon
2f8b4ce0c0 [Model Runner V2] Do not initialize sampler for non-last PP ranks (#36824)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-12 03:55:28 +00:00
Yuwei An
2ef69456f5 [LMCache] Fault Tolerance Mechanism (#36586)
Signed-off-by: Oasis-Git <ayw.sirius19@gmail.com>
2026-03-12 03:54:39 +00:00
Louie Tsai
17852aa503 more models for vLLM Benchmark Suite (#35086)
Signed-off-by: louie-tsai <louie.tsai@intel.com>
2026-03-12 11:36:51 +08:00
Flora Feng
8647c6cf51 [Bugfix] Fix minimax_m2 tool parser when stream interval > 1 (#35895)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-03-12 10:25:14 +08:00
Kunshang Ji
513949f95f [XPU][Doc] Remove manual OneAPI install step, now handled by torch-xpu (#36831)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
2026-03-12 01:46:02 +00:00
Nick Hill
262b76a09f [Frontend] Exclude anthropic billing header to avoid prefix cache miss (#36829)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-12 01:20:34 +00:00
Wentao Ye
c34ba6b961 [Perf] Optimize compute maxsim using batched version, 3.2% E2E throughput improvement (#36710)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-12 08:37:01 +08:00
Matthias Gehre
24062b704f [ROCm][CI/Build] Add gfx1152/gfx1153 (Krackan) to HIP supported architectures (#36499)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
2026-03-11 23:14:40 +00:00
Aaron Hao
d6b61e5166 [BUG] Fix async rlhf tests (#35811)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2026-03-11 18:06:10 -04:00
Yanan Cao
cf632499ee [Kernel] [Helion] [15/N] Split config files into per-platform files (#36698)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 17:25:29 -04:00
Yanan Cao
a3774a8198 [Kernel] [Helion] [12/N] Use FakeTensorMode to avoid GPU allocation during config key computation (#36563)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 17:25:16 -04:00
Yanan Cao
0ce21c46a0 [Kernel] [Helion] [14/N] Set autotune_ignore_errors=True during autotuning (#36683)
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 17:25:04 -04:00
Woosuk Kwon
55eed6b7a5 [Model Runner V2] Add WhisperModelState [6/N] (#35790)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-11 14:20:38 -07:00
Giancarlo Delfin
c77181e534 [Model Runner V2] Add probabilistic rejection sampling for spec decoding (#35461)
Signed-off-by: Giancarlo Delfin <gdelfin@inferact.ai>
2026-03-11 14:04:32 -07:00
maobaolong
12001f2ebc [LMCache] Pass TP size in lookup for MLA multi-reader locking (#36129)
Signed-off-by: baoloongmao <baoloongmao@tencent.com>
Co-authored-by: Yihua Cheng <yihua98@uchicago.edu>
2026-03-11 20:45:20 +00:00
Or Ozeri
7ee5d5093b [BugFix][kv_offload] Fix offloading decodes with async scheduling (#33881)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-03-11 20:43:40 +00:00
jennyyyyzhen
428bc718bd [Bugfix][ROCm] Strip block_size before attention backend validation (#36274)
Signed-off-by: jennyyyyzhen <yzhen@hmc.edu>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-03-11 13:37:31 -07:00
汪志鹏
ff1e3d9c63 [BugFix]: add bagel to MM_PREFIX_LM_MODELS (#36316)
Signed-off-by: princepride <wangzhipeng628@gmail.com>
2026-03-11 19:55:59 +00:00
Wentao Ye
35bdca5431 [Refactor] Remove dead code in KV connector (#36424)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-11 19:40:17 +00:00
Amanzhol Salykov
8a24842765 [ROCm] add tuned moe_wna16_triton kernel configs for CDNA4 (#35093)
Signed-off-by: salykova <amsalykov@gmail.com>
Signed-off-by: amd-asalykov <asalykov@amd.com>
2026-03-11 19:00:08 +00:00
Harry Mellor
65986db6ba Make Gemma and Gemma 2 accept inputs_embeds like Gemma 3 (#36787)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 18:12:43 +00:00
Luka Govedič
9556af87d5 [torch.compile] Add support for non-contiguous fused RMSNorm + group quant (#36551)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
2026-03-11 10:56:55 -07:00
Or Ozeri
a1a3523a56 [KVConnector] Support worker -> scheduler metadata (#31964)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-03-11 17:36:37 +00:00
tianshu-Michael-yu
741f4e046b fix: align lfm2 thumbnail token counting with HF (#36707) 2026-03-11 10:28:38 -07:00
Julien Denize
a5d06dc557 Add 320 dimension size support to MLA (#36161)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2026-03-11 10:21:22 -07:00
Harry Mellor
5efa206a8c Fix ExaoneMoeMTP test that never ran in Transformers v4 (#36792)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 17:10:23 +00:00
Cyrus Leung
196802dfa6 [Misc] Clean up renderers (#36770)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-11 16:39:29 +00:00
Isotr0py
c84b519cf3 [Bugfix] Fix negative max_tokens when input prompt is too long (#36789)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-11 16:30:51 +00:00
Flora Feng
741ecf0630 [CI] Add bfcl tool call correctness eval (#36560)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-03-11 12:27:36 -04:00
Robert Shaw
b7e5a588d8 [Bugfix] Fix DP/EP Shared Expert With Monolithic Kernels (#36061)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-03-11 16:07:14 +00:00
Richard Zou
822e250ab7 [torch.compile] Use FakeTensors instead of real GPU tensors for single-size compilation (#36093)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-03-11 16:07:09 +00:00
Hongxin Xu
bea02cdf93 Fix routed experts capture for hybrid models (Mamba + Attention) (#35744)
Signed-off-by: arlenxu <arlenxu@tencent.com>
Signed-off-by: xhx1022 <1737006628@qq.com>
Co-authored-by: arlenxu <arlenxu@tencent.com>
2026-03-11 08:53:10 -07:00
Julien Denize
a3ea760ea5 Add 'none' reasoning effort to ChatCompletionRequest (#36238)
Signed-off-by: Julien Denize <julien.denize@mistral.ai>
2026-03-11 15:45:34 +00:00
Harry Mellor
35db669f1d Correct link to supported hardware on vllm.ai (#36798)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 08:43:28 -07:00
Julien Denize
afebeffbfb Add support to Mistral large 3 eagle with dense layers (#36163)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-11 15:42:56 +00:00
Jhao-Ting Chen
5573894737 Kimi k2.5 MLA based eagle3 (#36361)
Signed-off-by: Izzy Putterman <iputterman@nvidia.com>
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
Co-authored-by: Izzy Putterman <iputterman@nvidia.com>
2026-03-11 11:36:11 -04:00
Harry Mellor
d5816c8c2f Fix tied weights in weight mapping test for Transformers v5 (#36788)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 15:10:26 +00:00
Woosuk Kwon
8ccbcda5c0 [Model Runner V2] Remove unused warmup_for_prefill method (#36762)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-11 08:02:44 -07:00
tvirolai-amd
a9e532afe2 [ROCm][Perf] Allow MTP lens > 1 in Sparse MLA (#36681)
Signed-off-by: Teemu Virolainen <teemu.virolainen@amd.com>
2026-03-11 14:43:03 +00:00
Harry Mellor
f3163bba67 Disable docs build skipping until a better solution is found (#36790)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 13:53:23 +00:00
Martin Hickey
700a1ddc65 [Misc] Use envs module to get VLLM_DISABLED_KERNELS (#35776)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
2026-03-11 13:37:46 +00:00
Silvia Colabrese
f33251ffc8 [Bugfix] Fix Mistral-small --format (#36782)
Signed-off-by: 12010486 <silvia.colabrese@intel.com>
2026-03-11 04:47:52 -07:00
Wuxun Zhang
e584dce52b Add XPU MLA Sparse backend for DeepSeek v3.2 (#33230)
Signed-off-by: Zhang, Wuxun <wuxun.zhang@intel.com>
2026-03-11 19:19:15 +08:00
Ning Xie
40c0461f24 [openapi] refactor render related openapi [3/N] (#36749)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2026-03-11 03:14:34 -07:00
Weiguang Li
724759684c [Bugfix] Fix Qwen3-VL timestamp mismatch when using num_frames without fps (#36136)
Signed-off-by: OiPunk <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 03:13:06 -07:00
Michael Goin
9c34e9d24f Disable cascade attention by default (#36318) 2026-03-11 03:12:23 -07:00
Richard Zou
09b6f99852 [compile] aot_compile should respect VLLM_DISABLE_COMPILE_CACHE (#36358)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-03-11 03:12:03 -07:00
Ethan T.
c87fb515ed fix(lora): use replaced_module_name in pooling model name check (#36402)
Signed-off-by: gambletan <ethanchang32@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 03:11:27 -07:00
Itay Alroy
5353c9b016 platforms: Fix Ray DP startup crash (#36665)
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
2026-03-11 03:08:55 -07:00
Angela Yi
13e79fc811 [ci] Update rtol for test_classification (#36556)
Signed-off-by: angelayi <yiangela7@gmail.com>
Co-authored-by: Richard Zou <zou3519@users.noreply.github.com>
2026-03-11 03:08:16 -07:00
Rahul Tuli
9d07a3d6e4 Add: Eagle3 support for Qwen3.5 (#36658)
Signed-off-by: Rahul-Tuli <rtuli@redhat.com>
2026-03-11 03:07:42 -07:00
Cyrus Leung
646b85544b [Refactor] Remove Molmo2 processor wrapper (#36667)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-11 03:07:20 -07:00
tc-mb
4286cc5ec2 fix(minicpmv): fix audio inference by handling meta device in init_re… (#36751)
Signed-off-by: caitianchi <caitianchi@modelbest.cn>
2026-03-11 03:06:28 -07:00
LoganJane
545d18d81b [Bugfix] Support other quantization methods in glm41v (#36321)
Signed-off-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: g00887675/loganJane <g00887675/loganJane73@hotmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-11 09:48:05 +00:00
roikoren755
e661b9ee83 [NemotronH] Small fix reasoning parser (#36635)
Signed-off-by: Roi Koren <roik@nvidia.com>
2026-03-11 02:44:41 -07:00
YiSheng5
c910eeb125 [XPU] Bug fix for unexpected errors when using the AgRs backend on XPU devices (#36593)
Signed-off-by: yisheng <yi.sheng@intel.com>
2026-03-11 09:17:46 +00:00
Harry Mellor
f4ae58b38b Remove unused config field from Gemma2 (#36672)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-11 01:51:19 -07:00
Isotr0py
e568cf88bc [UX] Infer dtype for local checkpoint (#36218)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-11 08:50:04 +00:00
Nicolò Lucchesi
098d844731 [NIXL][1/N] Refactor kernel_block_size detection (#35752)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-03-11 01:11:23 -07:00
JartX
a40ee486f2 [Bugfix] Add Multiple of 16 block_size to triton fallback on rocm Attention to support qwen3_5 (#35923)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: akaratza <akaratza@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2026-03-11 07:45:57 +00:00
pschlan-amd
eac2dc2b41 AITER MLA backend: Avoid CPU sync in _build_decode (#35765)
Signed-off-by: Patrick Schlangen <pschlan@amd.com>
2026-03-11 07:25:00 +00:00
Flora Feng
d5080aeaa4 [Refactor] Remove deadcode in Responses API serving (#36726)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
2026-03-11 07:11:41 +00:00
liuzhenwei
f22d6e0267 [Hardware][NIXL] set default kv buffer type for different platform (#36438)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-11 05:19:28 +00:00
Kunshang Ji
76c6e6da08 [XPU] Support block fp8 moe by fallback to TritonExpert on XPU (#36458)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-10 21:54:09 -07:00
typer-J
4184653775 feat: add RISC-V support for CPU backend (v2) (#36578)
Signed-off-by: typer-J <2236066784@qq.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2026-03-10 21:51:39 -07:00
Sladyn
4aaaf8c8ce feat(spec_decode): fuse EAGLE step slot mapping and metadata updates (#33503)
Signed-off-by: sladynnunes <snunes@usc.edu>
2026-03-11 04:35:33 +00:00
Hongbin Guo
4bf533623b [Doc] Fix duplicate words in comments (#36713)
Signed-off-by: Hongbin10 <jdmjdm1998@163.com>
2026-03-10 21:28:31 -07:00
Matthew Bonanni
5f77ef15ae [Misc][Attention] Clean up unused method in CPU_ATTN (#36673)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-10 21:27:22 -07:00
elvischenv
7d6abdd022 [Fix] Use torch.empty for output in attention+quant fusion (#31785)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2026-03-10 21:26:14 -07:00
Wentao Ye
a8ff2cca92 [Perf] Optimize scheduler overhead for PD disaggregation, around 5% E2E perf improvement (#35781)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
2026-03-10 21:25:30 -07:00
tunglinwood
42fadebecb [Model] Add support for moonshotai/Kimi-Audio-7B-Instruct (#36127)
Signed-off-by: tunglinwood <tunglinwood@gmail.com>
Signed-off-by: tunglinwood <tomwu.tunglin@gmail.com>
Signed-off-by: tunglinwood <113751333+tunglinwood@users.noreply.github.com>
2026-03-10 21:24:48 -07:00
tianshu-Michael-yu
a197eda9c3 Add tuned H100 MoE configs for LFM2 8B and 24B (#36699) 2026-03-10 21:22:02 -07:00
Kevin H. Luu
82b110d50e [ci] Bound nvidia-cudnn-frontend version (#36719)
Signed-off-by: khluu <khluu000@gmail.com>
2026-03-11 12:17:35 +08:00
Benjamin Chislett
9040cd40af [DSV3.2][MTP] Optimize Indexer MTP handling (#36723)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-03-11 12:16:56 +08:00
fangyuchu
fa0d353acf [Bugfix] Surface exceptions from non-blocking execute_model in UniProcExecutor to avoid DP deadlocks (#35194)
Signed-off-by: fangyuchu <fangyuchu@qq.com>
2026-03-11 03:22:21 +00:00
Augusto Yao
b386bb3d7c fix bugs when token_classify & classify run concurrently (#36614)
Signed-off-by: augusto.yjh <augusto.yjh@antgroup.com>
2026-03-10 20:16:34 -07:00
Ning Xie
fe714dd507 [openapi server] log exception in exception handler(2/N) (#36201)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2026-03-10 20:16:30 -07:00
Matthew Bonanni
8ab3d7427c [Bugfix] Fix DeepSeek V3.2 OOM during CG memory profiling (#36691)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-11 03:01:07 +00:00
Wei Zhao
84e436ed1c [Bug] Fix TRTLLM Block FP8 MoE Monolithic (#36296)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-03-10 22:04:47 -04:00
Andreas Karatzas
81939e7733 [ROCm][CI] Making some tests optional to reduce workload (#36090)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-10 16:45:27 -07:00
Woosuk Kwon
195d1ca3e8 [Minor] Enhance error message for TRTLLM decode uniformity check (#36609)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-10 15:38:45 -07:00
Nick Hill
8d983d7cd6 [Model Runner V2] Add initial CI tests (#36041)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-10 14:55:21 -07:00
Nick Hill
65b2f405dc [Core] Simplify core kv-cache blocks initialization logic (#36521)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-10 20:20:02 +00:00
Nick Hill
2a68464c5b [Test] test_async_scheduling.py improvements (#36340)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-10 11:17:26 -07:00
Zhengxu Chen
bdd8981dab [compile] Apply stored functorch config while finalizing loaded artifacts. (#36582)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-03-10 09:34:35 -07:00
Woosuk Kwon
f088a831dd [Model Runner V2] Use unpadded num_tokens for PW CUDA graph attn metadata (#36626)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-10 09:30:56 -07:00
671 changed files with 58663 additions and 34929 deletions


@@ -10,7 +10,7 @@ steps:
       docker build
       --build-arg max_jobs=16
       --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
       --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
       --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
       -f docker/Dockerfile.rocm


@@ -21,6 +21,20 @@ steps:
       pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
       pytest -x -v -s tests/kernels/test_onednn.py"
 
+- label: CPU-Compatibility Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+    - cmake/cpu_extension.cmake
+    - setup.py
+    - vllm/platforms/cpu.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+        bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
+
 - label: CPU-Language Generation and Pooling Model Tests
   depends_on: []
   soft_fail: true


@@ -25,9 +25,7 @@ fi
 docker build --file docker/Dockerfile.cpu \
     --build-arg max_jobs=16 \
     --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-    --build-arg VLLM_CPU_AVX512BF16=true \
-    --build-arg VLLM_CPU_AVX512VNNI=true \
-    --build-arg VLLM_CPU_AMXBF16=true \
+    --build-arg VLLM_CPU_X86=true \
     --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
     --target vllm-test \
     --progress plain .


@@ -7,12 +7,12 @@ import argparse
 import html as _html
 import json
 import os
 from contextlib import nullcontext
 from dataclasses import dataclass
 from importlib import util
 from pathlib import Path
 
 import pandas as pd
 import regex as re
 
 pd.options.display.float_format = "{:.2f}".format
 
 plotly_found = util.find_spec("plotly.express") is not None
@@ -33,6 +33,45 @@ pd.set_option("display.precision", 2)
 pd.set_option("display.float_format", lambda x: f"{x:.2f}")
 
+# -----------------------------
+# Concurrency normalization (NEW, small)
+# -----------------------------
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    for c in df.columns:
+        if "concurr" in str(c).lower():
+            s = df[c]
+            if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
+                return c
+    raise ValueError(
+        "Cannot infer concurrency column. "
+        "Please rename the column to one of the known names "
+        "or add an explicit override (e.g., --concurrency-col)."
+    )
+
+
+def _normalize_concurrency_in_df(
+    df: pd.DataFrame, canonical: str = "# of max concurrency."
+) -> pd.DataFrame:
+    if canonical in df.columns:
+        return df
+    detected = _find_concurrency_col(df)
+    if detected in df.columns and detected != canonical:
+        return df.rename(columns={detected: canonical})
+    df[canonical] = pd.NA
+    return df
+
+
 # -----------------------------
 # Core data compare
 # -----------------------------
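A small usage sketch of the normalization helpers added above (hypothetical data, assuming both helpers are in scope): a file that labels the column `max_concurrency` is renamed to the canonical `"# of max concurrency."` header, while an already-canonical file passes through unchanged.

```python
import pandas as pd

df_a = pd.DataFrame({"# of max concurrency.": [1, 4, 16], "TTFT": [20.0, 35.0, 90.0]})
df_b = pd.DataFrame({"max_concurrency": [1, 4, 16], "TTFT": [22.0, 33.0, 95.0]})

for df in (df_a, df_b):
    out = _normalize_concurrency_in_df(df)
    # Both frames now expose the same canonical concurrency column.
    assert "# of max concurrency." in out.columns
```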
@@ -52,19 +91,25 @@ def compare_data_columns(
     - Concat along axis=1 (indexes align), then reset_index so callers can
       group by columns.
     - If --debug, add a <file_label>_name column per file.
+
+    Minimal fix to support different max_concurrency lists across files:
+    - normalize concurrency column naming to "# of max concurrency."
+    - align on UNION of keys (missing points become NaN)
+    - BUGFIX: don't drop throughput rows based on P99/Median presence
     """
     print("\ncompare_data_column:", data_column)
 
     frames = []
     raw_data_cols: list[str] = []
-    compare_frames = []
+
+    # Determine key cols after normalizing concurrency
+    cols_per_file: list[set] = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -75,12 +120,25 @@
             "No common key columns found from info_cols across the input files."
         )
 
     meta_added = False
+    union_index = None
+    metas: list[pd.DataFrame] = []
+    staged: list[tuple[str, pd.Series, pd.Series | None]] = []
+
     for file in files:
         df = pd.read_json(file, orient="records")
+        df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
 
-        if drop_column in df.columns:
+        # BUGFIX: only drop rows for latency-like metrics; throughput rows may have
+        # NaN in P99/Median columns even if the column exists in the JSON.
+        metric_lc = str(data_column).lower()
+        is_latency_metric = (
+            "ttft" in metric_lc
+            or "tpot" in metric_lc
+            or "p99" in metric_lc
+            or "median" in metric_lc
+            or metric_lc.strip() in {"p99", "median"}
+        )
+        if is_latency_metric and drop_column in df.columns:
             df = df.dropna(subset=[drop_column], ignore_index=True)
 
         for c in (
@@ -105,35 +163,61 @@
             meta = meta.groupby(level=key_cols, dropna=False).first()
 
         file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
-        s = df_idx[data_column]
-        if not s.index.is_unique:
-            s = s.groupby(level=key_cols, dropna=False).mean()
+        if data_column in df_idx.columns:
+            s = df_idx[data_column]
+            if not s.index.is_unique:
+                s = s.groupby(level=key_cols, dropna=False).mean()
+        else:
+            # keep NA series to preserve meta keys for union_index
+            s = pd.Series(pd.NA, index=meta.index)
         s.name = file_label
 
-        if not meta_added:
-            frames.append(meta)
-            meta_added = True
-
         name_s = None
         if debug and name_column in df_idx.columns:
             name_s = df_idx[name_column]
             if not name_s.index.is_unique:
                 name_s = name_s.groupby(level=key_cols, dropna=False).first()
             name_s.name = f"{file_label}_name"
-            frames.append(name_s)
-        frames.append(s)
+
+        if union_index is None:
+            union_index = meta.index
+        else:
+            union_index = union_index.union(meta.index)
+        metas.append(meta)
+        staged.append((file_label, s, name_s))
+
+    if union_index is None:
+        raise ValueError("No data found after loading inputs.")
+
+    # meta first (union-aligned): build UNION meta across all files
+    if metas:
+        meta_union = pd.concat(metas, axis=0)
+        # Collapse duplicates on the MultiIndex; keep first non-null per column
+        meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
+        frames.append(meta_union.reindex(union_index))
+
+    # values + ratios (union-aligned)
+    metric_series_aligned: list[pd.Series] = []
+    for file_label, s, name_s in staged:
+        s_aligned = s.reindex(union_index)
+        frames.append(s_aligned)
         raw_data_cols.append(file_label)
-        compare_frames.append(s)
+        metric_series_aligned.append(s_aligned)
+        if debug and name_s is not None:
+            frames.append(name_s.reindex(union_index))
 
-        if len(compare_frames) >= 2:
-            base = compare_frames[0]
-            current = compare_frames[-1]
-            if "P99" in data_column or "Median" in data_column:
+    if len(metric_series_aligned) >= 2:
+        base = metric_series_aligned[0]
+        current = metric_series_aligned[-1]
+        if "P99" in str(data_column) or "Median" in str(data_column):
             ratio = base / current
         else:
             ratio = current / base
         ratio = ratio.mask(base == 0)
-        ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+        ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}"
         frames.append(ratio)
 
     concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
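A worked illustration of the ratio orientation above, with made-up numbers: for latency-like columns (P99/Median) the ratio is base/current, so values above 1.0 mean the newer file improved, while throughput uses current/base; rows where the base is zero are masked to NaN.

```python
import pandas as pd

base = pd.Series([100.0, 0.0])     # e.g. P99 latency (ms) from file 1
current = pd.Series([80.0, 75.0])  # the same metric from file 2

ratio = (base / current).mask(base == 0)
print(ratio.tolist())  # [1.25, nan]: 1.25 > 1 means latency dropped; base == 0 is masked
```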
@@ -204,24 +288,10 @@ def split_json_by_tp_pp(
 # -----------------------------
 # Styling helpers
 # -----------------------------
-def _find_concurrency_col(df: pd.DataFrame) -> str:
-    for c in [
-        "# of max concurrency.",
-        "# of max concurrency",
-        "Max Concurrency",
-        "max_concurrency",
-        "Concurrency",
-    ]:
-        if c in df.columns:
-            return c
-    for c in df.columns:
-        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
-            return c
-    return "# of max concurrency."
-
-
 def _highlight_threshold(
-    df: pd.DataFrame, threshold: float
+    df: pd.DataFrame,
+    threshold: float,
+    slack_pct: float = 0.0,
 ) -> pd.io.formats.style.Styler:
     conc_col = _find_concurrency_col(df)
     key_cols = [
@@ -234,12 +304,24 @@
     ]
     conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
 
-    return df.style.map(
-        lambda v: "background-color:#e6ffe6;font-weight:bold;"
-        if pd.notna(v) and v <= threshold
-        else "",
-        subset=conf_cols,
-    )
+    try:
+        slack_pct = float(slack_pct or 0.0)
+    except Exception:
+        slack_pct = 0.0
+    slack_limit = threshold * (1.0 + slack_pct / 100.0)
+
+    def _cell(v):
+        if pd.isna(v):
+            return ""
+        if v <= threshold:
+            # Strict SLA
+            return "background-color:#e6ffe6;font-weight:bold;"
+        if v <= slack_limit:
+            # Within slack range
+            return "background-color:#ffe5cc;font-weight:bold;"
+        return ""
+
+    return df.style.map(_cell, subset=conf_cols)
 
 
 def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
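A quick numeric check of the slack logic above, assuming an SLA of 200 ms with 10% slack: the slack limit is 200 * 1.10 = 220 ms, so 195 gets the strict (green) highlight, 210 falls in the slack (orange) band, and 230 is left unhighlighted.

```python
threshold, slack_pct = 200.0, 10.0
slack_limit = threshold * (1.0 + slack_pct / 100.0)  # 220.0 ms

for v in (195.0, 210.0, 230.0):
    if v <= threshold:
        print(v, "-> strict SLA (green)")
    elif v <= slack_limit:
        print(v, "-> within slack (orange)")
    else:
        print(v, "-> no highlight")
```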
@@ -286,11 +368,30 @@ def _sanitize_sheet_name(name: str) -> str:
     - max 31 chars
     - cannot contain: : \ / ? * [ ]
     - cannot be empty
+
+    NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
+    module's compile overhead/edge-cases on some systems.
     """
     name = "sheet" if name is None else str(name)
-    name = re.sub(r"[:\\/?*\[\]]", "_", name)
+
+    # Replace illegal characters with underscore.
+    trans = str.maketrans(
+        {
+            ":": "_",
+            "\\": "_",
+            "/": "_",
+            "?": "_",
+            "*": "_",
+            "[": "_",
+            "]": "_",
+        }
+    )
+    name = name.translate(trans)
+
+    # Strip quotes/spaces and collapse whitespace.
     name = name.strip().strip("'")
-    name = re.sub(r"\s+", " ", name)
+    name = " ".join(name.split())
+
     if not name:
         name = "sheet"
     return name[:31]
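A small check of the translate-based sanitizer above, with a made-up sheet name (the `trans` table here mirrors the one in the function): illegal Excel characters become underscores, whitespace is collapsed, and the result is capped at 31 characters.

```python
trans = str.maketrans({c: "_" for c in ":\\/?*[]"})

name = " meta/Llama-3: [eval]  run "
name = " ".join(name.translate(trans).strip().strip("'").split())
print(name[:31])  # -> meta_Llama-3_ _eval_ run
```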
@@ -298,30 +399,57 @@ def _sanitize_sheet_name(name: str) -> str:
 def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
     d = dict(zip(group_cols, gkey_tuple))
-    model = d.get("Model", "model")
-    model_short = str(model).split("/")[-1]
+
+    # Always keep input/output lengths (these are important).
     ilen = d.get("Input Len", "")
     olen = d.get("Output Len", "")
     lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
+
+    # Shorten model name aggressively to make room for lens.
+    model = d.get("Model", "model")
+    leaf = str(model).split("/")[-1]
+    max_model_len = max(1, 31 - len(lens))
+    model_short = leaf[:max_model_len]
+
     return _sanitize_sheet_name(f"{model_short}{lens}")
 
 
 def _write_tables_to_excel_sheet(
     writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
 ):
-    startrow = 0
+    """Write all blocks to a sheet with a single to_excel() call.
+
+    Pandas+openpyxl can be extremely slow when called many times per sheet.
+    We flatten blocks into one table with a 'Section' column to keep structure
+    while making Excel generation fast and deterministic.
+    """
+    if not blocks:
+        pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
+        return
+
+    combined_parts: list[pd.DataFrame] = []
     for title, df in blocks:
-        pd.DataFrame([[title]]).to_excel(
-            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
-        )
-        startrow += 1
-        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
-        startrow += len(df) + 3
+        df2 = df.copy()
+        # Put the section label as the first column for readability.
+        df2.insert(0, "Section", title)
+        combined_parts.append(df2)
+
+    combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
+    combined.to_excel(writer, sheet_name=sheet, index=False)
 
 
 def _safe_filename(s: str) -> str:
-    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
-    return s[:180] if len(s) > 180 else s
+    # Fast path without the third-party `regex` module.
+    s = " ".join(str(s).strip().split())
+    allowed = []
+    for ch in s:
+        if ch.isalnum() or ch in "._-":
+            allowed.append(ch)
+        else:
+            allowed.append("_")
+    out = "".join(allowed)
+    return out[:180] if len(out) > 180 else out
 
 
 # -----------------------------
@@ -428,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok(
df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
df: pd.DataFrame,
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
):
if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA
@@ -441,7 +573,14 @@ def _max_concurrency_ok(
if d.empty:
return pd.NA
ok = d[d[cfg_col] <= threshold]
# Accept values up to (1 + slack_pct%) above the SLA.
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
if ok.empty:
return pd.NA
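The slack arithmetic in a worked example (illustrative numbers): a 100 ms SLA with 5% slack accepts rows up to 105 ms, so the 104 ms row at concurrency 16 below still counts as passing:

```python
import pandas as pd

threshold, slack_pct = 100.0, 5.0
effective_limit = threshold * (1.0 + slack_pct / 100.0)  # 105.0

d = pd.DataFrame({"Concurrency": [8, 16, 32], "cfg": [92.0, 104.0, 130.0]})
ok = d[d["cfg"] <= effective_limit]
print(ok["Concurrency"].max())  # 16
```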
@@ -507,15 +646,25 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
@@ -544,8 +693,8 @@ def build_valid_max_concurrency_summary_html(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -620,15 +769,24 @@ def build_valid_max_concurrency_summary_df(
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
@@ -657,8 +815,8 @@ def build_valid_max_concurrency_summary_df(
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
@@ -751,7 +909,21 @@ def build_parser() -> argparse.ArgumentParser:
help="Reference limit for TPOT plots (ms)",
)
# ---- NEW: export options ----
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument(
"--excel-out",
type=str,
@@ -843,9 +1015,13 @@ def render_metric_table_html(
metric_name = metric_label.lower()
if "ttft" in metric_name:
styler = _highlight_threshold(display_group, args.ttft_max_ms)
styler = _highlight_threshold(
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold(display_group, args.tpot_max_ms)
styler = _highlight_threshold(
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
else:
styler = display_group.style
@@ -962,22 +1138,46 @@ def write_report_group_first(
csv_dir.mkdir(parents=True, exist_ok=True)
excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1"
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable.
excel_engine = (
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter"
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {}
if excel_engine == "xlsxwriter":
# Reduce memory pressure & usually faster writes.
excel_engine_kwargs = {"options": {"constant_memory": True}}
xw_ctx = (
nullcontext(None)
if disable_excel
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs
)
)
with xw_ctx as xw:
used_sheets: set[str] = set()
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
if xw is not None:
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
@@ -993,12 +1193,19 @@ def write_report_group_first(
main_fh.write(group_header)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = []
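Sketch of the truncation-aware dedup with hypothetical sheet names; the base is shortened so `<base>_<n>` still fits Excel's 31-character cap (sanitization omitted here):

```python
used = {"Llama-3.1-8B-Instruct_2048x2048"}
base = "Llama-3.1-8B-Instruct_2048x2048"  # already 31 chars
sheet, i = base, 1
while sheet in used:
    i += 1
    suffix = f"_{i}"
    sheet = base[: max(1, 31 - len(suffix))] + suffix
print(sheet)  # Llama-3.1-8B-Instruct_2048x20_2 (still 31 chars)
```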
@@ -1059,7 +1266,7 @@ def write_report_group_first(
)
excel_blocks.append(
(metric_label, display_group.reset_index(drop=True))
(metric_label, group_df.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
@@ -1067,7 +1274,7 @@ def write_report_group_first(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
@@ -1097,9 +1304,13 @@ def write_report_group_first(
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}")
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")


@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
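The jq expression above takes the first non-null value among several result-schema variants, preferring p99 over mean. An equivalent Python sketch (same field order; the file path is whatever result JSON you point it at):

```python
import json

TTFT_PATHS = [
    "ttft_ms.p99", "metrics.ttft_ms.p99", "ttft.p99", "metrics.ttft.p99",
    "p99_ttft_ms",
    "ttft_ms.mean", "metrics.ttft_ms.mean", "ttft.mean", "metrics.ttft.mean",
    "mean_ttft_ms",
]

def _get(d, dotted):
    # Walk a dotted path like "metrics.ttft_ms.p99"; None if any hop is missing.
    for key in dotted.split("."):
        if not isinstance(d, dict) or key not in d:
            return None
        d = d[key]
    return d

def extract_ttft_ms(json_path):
    with open(json_path) as f:
        data = json.load(f)
    for path in TTFT_PATHS:
        value = _get(data, path)
        if value is not None:
            return value
    return None
```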
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
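To make the control flow above easier to follow, here is a compact Python model of the same strategy; `probe` stands in for run_single_serving_probe, and the example SLA boundary (90) is made up:

```python
def refine(static_points, probe, max_probes=8, max_conc=1024):
    pts = sorted(set(static_points))
    last_pass = first_fail = None
    for p in pts:                     # scan existing static results
        if probe(p):
            last_pass = p
        elif last_pass is not None:
            first_fail = p
            break
    if last_pass is None:
        return None                   # nothing passed the SLA
    probes = 0
    if first_fail is None:
        # Everything passed: grow with doubling steps until a failure
        # or the concurrency cap, then bisect the bracket.
        step = pts[-1] - pts[-2] if len(pts) > 1 else 1
        low, high = last_pass, None
        while probes < max_probes:
            nxt = min(low + step, max_conc)
            if nxt <= low:
                break
            probes += 1
            if probe(nxt):
                low = nxt
                if nxt == max_conc:
                    break
                step *= 2
            else:
                high = nxt
                break
    else:
        low, high = last_pass, first_fail
    while high is not None and low + 1 < high and probes < max_probes:
        mid = (low + high) // 2       # binary search inside (pass, fail)
        probes += 1
        if probe(mid):
            low = mid
        else:
            high = mid
    return low

print(refine([12, 16, 24, 32, 64, 128, 200], lambda c: c <= 90))  # 90
```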
run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params")
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
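The scaling rule from the comment block above, as a small Python helper (env values illustrative):

```python
def num_prompts_for(max_concurrency, per_conc, lo=1, hi=1_000_000):
    # num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY, clamped
    # to [MIN_NUM_PROMPTS, MAX_NUM_PROMPTS].
    return min(max(max_concurrency * per_conc, lo), hi)

print(num_prompts_for(64, 10))          # 640
print(num_prompts_for(2, 10, lo=100))   # 100 (clamped up)
```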
# qps_list
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
fi
# check that server model and client model are aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="$server_envs vllm serve \
server_command="$server_envs vllm serve $server_model \
$server_args"
# run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args $client_remote_args "
$client_args_effective $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done
# clean up
@@ -532,6 +888,7 @@ main() {
# postprocess benchmarking results
pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite
}


@@ -0,0 +1,37 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}


@@ -149,6 +149,39 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
@@ -188,6 +221,45 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {


@@ -72,17 +72,6 @@
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
@@ -105,17 +94,6 @@
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
@@ -139,14 +117,25 @@
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
}
]


@@ -83,7 +83,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:


@@ -205,6 +205,13 @@ re_quote_pytest_markers() {
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
@@ -242,6 +249,11 @@ re_quote_pytest_markers() {
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
@@ -492,6 +504,8 @@ else
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e BUILDKITE_PARALLEL_JOB \
-e BUILDKITE_PARALLEL_JOB_COUNT \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \


@@ -0,0 +1,65 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce subprocess usage to speed up the run
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log


@@ -0,0 +1,248 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

File diff suppressed because it is too large


@@ -14,8 +14,3 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd


@@ -101,8 +101,8 @@ steps:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
@@ -132,9 +132,9 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
@@ -150,8 +150,8 @@ steps:
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
@@ -205,7 +205,7 @@ steps:
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"


@@ -50,24 +50,18 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50
- label: Distributed Torchrun + Examples (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_torchrun_example.py
- tests/distributed/test_torchrun_example_moe.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
@@ -85,21 +79,6 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
@@ -109,6 +88,47 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
device: h100
@@ -149,7 +169,7 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
# - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py


@@ -1,5 +1,5 @@
group: Engine
depends_on:
depends_on:
- image-build
steps:
- label: Engine
@@ -14,28 +14,30 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine (1 GPU)
timeout_in_minutes: 45
- label: Engine (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/v1
- vllm/v1/engine/
- tests/v1/engine/
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -46,7 +48,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
@@ -62,7 +64,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4


@@ -24,11 +24,6 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -39,7 +34,7 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
@@ -60,11 +55,6 @@ steps:
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
@@ -75,11 +65,6 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50


@@ -88,11 +88,6 @@ steps:
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20


@@ -0,0 +1,110 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"


@@ -2,15 +2,59 @@ group: Models - Multimodal
depends_on:
- image-build
steps:
- label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 80
- label: "Multi-Modal Models (Standard) 1: qwen2"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:


@@ -39,8 +39,3 @@ steps:
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
mirror:
amd:
device: mi325_2
depends_on:
- image-build-amd


@@ -0,0 +1,40 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"

.github/mergify.yml (vendored)

@@ -27,7 +27,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash
uv pip install pre-commit
uv pip install "pre-commit>=4.5.1"
pre-commit install
pre-commit run --all-files
```
@@ -334,7 +334,7 @@ pull_request_rules:
- or:
- files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*

.gitignore (vendored)

@@ -189,11 +189,9 @@ cython_debug/
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# Cursor


@@ -30,6 +30,7 @@ repos:
- id: markdownlint-cli2
language_version: lts
args: [--fix]
exclude: ^CLAUDE\.md$
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:


@@ -9,7 +9,7 @@ build:
python: "3.12"
jobs:
post_checkout:
- bash docs/maybe_skip_pr_build.sh
# - bash docs/maybe_skip_pr_build.sh
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv

AGENTS.md (new file)

@@ -0,0 +1,113 @@
# Agent Instructions for vLLM
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
> Breaching these guidelines can result in automatic banning.
## 1. Contribution Policy (Mandatory)
### Duplicate-work checks
Before proposing a PR, run these checks:
```bash
gh issue view <issue_number> --repo vllm-project/vllm --comments
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
```
- If an open PR already addresses the same fix, do not open another.
- If your approach is materially different, explain the difference in the issue.
### No low-value busywork PRs
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
### Accountability
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
- The submitting human must review every changed line and run relevant tests.
- PR descriptions for AI-assisted work **must** include:
- Why this is not duplicating an existing PR.
- Test commands run and results.
- Clear statement that AI assistance was used.
### Fail-closed behavior
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
---
## 2. Development Workflow
### Environment setup
```bash
# Install `uv` if you don't have it already:
curl -LsSf https://astral.sh/uv/install.sh | sh
# Always use `uv` for Python environment management:
uv venv --python 3.12
source .venv/bin/activate
# Always make sure `pre-commit` and its hooks are installed:
uv pip install -r requirements/lint.txt
pre-commit install
```
### Installing dependencies
```bash
# If you are only making Python changes:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
# If you are also making C/C++ changes:
uv pip install -e .
```
### Running tests
Tests require extra dependencies.
All versions for test dependencies should be read from `requirements/test.txt`.
```bash
# Install bare minimum test dependencies:
uv pip install pytest pytest-asyncio tblib
# Install additional test dependencies as needed, or install them all as follows:
uv pip install -r requirements/test.txt
# Run specific test from specific test file
pytest tests/path/to/test.py -v -s -k test_name
# Run all tests in directory
pytest tests/path/to/dir -v -s
```
### Running linters
```bash
# Run all pre-commit hooks on staged files:
pre-commit run
# Run on all files:
pre-commit run --all-files
# Run a specific hook:
pre-commit run ruff-check --all-files
# Run mypy as it is in CI:
pre-commit run mypy-3.10 --all-files --hook-stage manual
```
### Commit messages
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```

CLAUDE.md (new file)

@@ -0,0 +1 @@
@AGENTS.md


@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.


@@ -59,7 +59,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
return run_mla(
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -440,14 +442,21 @@ def main():
# Backend selection
parser.add_argument(
"--backends",
"--decode-backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
parser.add_argument(
"--prefill-backends",
nargs="+",
help="Prefill backends to compare (fa2, fa3, fa4). "
"Uses the first decode backend for impl construction.",
)
# Batch specifications
parser.add_argument(
@@ -502,7 +511,7 @@ def main():
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
cli_backends_provided = args.backend is not None or args.backends is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
@@ -512,6 +521,12 @@ def main():
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
elif "decode_backends" in yaml_config:
args.backends = yaml_config["decode_backends"]
args.backend = None
# Prefill backends (e.g., ["fa3", "fa4"])
args.prefill_backends = yaml_config.get("prefill_backends", None)
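As a side note, the precedence rule above (CLI flags beat YAML keys, with `decode_backends` accepted as an alias) can be summarized in a standalone sketch; `resolve_backends` is a hypothetical helper for illustration, not the benchmark's actual code:
```python
# Minimal sketch of the CLI-over-YAML precedence implemented above.
from types import SimpleNamespace


def resolve_backends(args, yaml_config: dict) -> list[str]:
    # CLI wins whenever --backend or --backends was given.
    if args.backend is not None:
        return [args.backend]
    if args.backends is not None:
        return list(args.backends)
    # Otherwise fall back to YAML, accepting any supported key spelling.
    for key in ("backend", "backends", "decode_backends"):
        if key in yaml_config:
            value = yaml_config[key]
            return [value] if isinstance(value, str) else list(value)
    return ["flash"]  # default decode backend


args = SimpleNamespace(backend=None, backends=None)
print(resolve_backends(args, {"decode_backends": ["CUTLASS_MLA"]}))
# -> ['CUTLASS_MLA']
```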
# Check for special modes
if "mode" in yaml_config:
@@ -613,7 +628,10 @@ def main():
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
@@ -850,37 +868,93 @@ def main():
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
decode_results = []
prefill_results = []
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Run decode backend comparison
if not prefill_backends:
# No prefill backends specified: compare decode backends as before
total = len(backends) * len(args.batch_specs)
result = run_benchmark(config)
all_results.append(result)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
result = run_benchmark(config)
decode_results.append(result)
pbar.update(1)
if not result.success:
console.print(
f"[red]Error {backend} {spec}: {result.error}[/]"
)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
pbar.update(1)
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(decode_results, backends)
# Run prefill backend comparison
if prefill_backends:
# Use first decode backend for impl construction
decode_backend = backends[0]
total = len(prefill_backends) * len(args.batch_specs)
console.print(
f"[yellow]Prefill comparison mode: "
f"using {decode_backend} for decode impl[/]"
)
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
for spec in args.batch_specs:
for pb in prefill_backends:
config = BenchmarkConfig(
backend=decode_backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
prefill_backend=pb,
)
result = run_benchmark(config)
# Label result with prefill backend name for display
labeled_config = replace(result.config, backend=pb)
result = replace(result, config=labeled_config)
prefill_results.append(result)
if not result.success:
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
pbar.update(1)
console.print("\n[bold green]Prefill Backend Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(
prefill_results, prefill_backends, compare_to_fastest=True
)
all_results = decode_results + prefill_results
# Save results
if all_results:


@@ -77,6 +77,7 @@ class MockKVBProj:
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
self.weight = torch.empty(0, dtype=torch.bfloat16)
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
@@ -213,6 +214,7 @@ class BenchmarkConfig:
use_cuda_graphs: bool = False
# MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None


@@ -1,4 +1,19 @@
# MLA prefill-only benchmark configuration for sparse backends
# MLA prefill backend comparison
#
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
#
# Backends that aren't available on the current platform will report errors
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
#
# Usage:
# python benchmark.py --config configs/mla_prefill.yaml
description: "MLA prefill backend comparison"
model:
name: "deepseek-v3"
@@ -12,20 +27,25 @@ model:
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
# model:
# name: "deepseek-v2-lite"
# num_layers: 27
# num_q_heads: 16
# num_kv_heads: 1
# head_dim: 576
# kv_lora_rank: 512
# qk_nope_head_dim: 128
# qk_rope_head_dim: 64
# v_head_dim: 128
# block_size: 128
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
- "q512"
- "q1k"
- "q2k"
- "q4k"
- "q8k"
# Batched pure prefill
- "2q512"
@@ -44,19 +64,63 @@ batch_specs:
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
# Chunked prefill / extend
# Short context
- "q128s1k"
- "q256s2k"
- "q512s4k"
- "q1ks4k"
- "q2ks8k"
- "2q128s1k"
- "2q256s2k"
- "2q512s4k"
- "2q1ks4k"
- "2q2ks8k"
- "4q128s1k"
- "4q256s2k"
- "4q512s4k"
- "4q1ks4k"
- "4q2ks8k"
- "8q128s1k"
- "8q256s2k"
- "8q512s4k"
- "8q1ks4k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
# Medium context
- "q128s16k"
- "q512s16k"
- "q1ks16k"
- "q2ks16k"
- "2q128s16k"
- "2q512s16k"
- "2q1ks16k"
- "2q2ks16k"
- "4q128s16k"
- "4q512s16k"
- "4q1ks16k"
- "4q2ks16k"
# Long context
- "q128s64k"
- "q512s64k"
- "q1ks64k"
- "q2ks64k"
- "2q128s64k"
- "2q512s64k"
- "2q1ks64k"
- "2q2ks64k"
decode_backends:
- CUTLASS_MLA
prefill_backends:
- fa2
- fa3
- fa4
- flashinfer
- cudnn
- trtllm
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true
repeats: 20
warmup_iters: 5


@@ -0,0 +1,62 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true


@@ -62,6 +62,7 @@ def create_minimal_vllm_config(
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -75,6 +76,9 @@ def create_minimal_vllm_config(
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
"cudnn", "trtllm"). Configures the attention config to
force the specified prefill backend.
Returns:
VllmConfig for benchmarking
@@ -163,7 +167,7 @@ def create_minimal_vllm_config(
compilation_config = CompilationConfig()
return VllmConfig(
vllm_config = VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
@@ -171,9 +175,84 @@ def create_minimal_vllm_config(
compilation_config=compilation_config,
)
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
"disable_flashinfer_prefill"
]
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
"use_cudnn_prefill"
]
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
"use_trtllm_ragged_deepseek_prefill"
]
return vllm_config
# ============================================================================
# Backend Configuration
# Prefill Backend Configuration
# ============================================================================
# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths.
# Non-FA backends enable their specific path and disable others.
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
"fa2": {
"flash_attn_version": 2,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa3": {
"flash_attn_version": 3,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa4": {
"flash_attn_version": 4,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"flashinfer": {
"flash_attn_version": None,
"disable_flashinfer_prefill": False,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"cudnn": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": True,
"use_trtllm_ragged_deepseek_prefill": False,
},
"trtllm": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": True,
},
}
def get_prefill_backend_config(prefill_backend: str) -> dict:
"""Get attention config overrides for a prefill backend."""
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
raise ValueError(
f"Unknown prefill backend: {prefill_backend!r}. "
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
)
return _PREFILL_BACKEND_CONFIG[prefill_backend]
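Roughly, applying one of these override dicts to an attention config looks like the sketch below; `AttnConfig` and `apply_prefill_backend` are stand-ins for illustration, not vLLM's real classes:
```python
# Sketch: applying a prefill-backend override dict to an attention config.
from dataclasses import dataclass


@dataclass
class AttnConfig:
    flash_attn_version: int | None = None
    disable_flashinfer_prefill: bool = False
    use_cudnn_prefill: bool = False
    use_trtllm_ragged_deepseek_prefill: bool = False


def apply_prefill_backend(cfg: AttnConfig, overrides: dict) -> AttnConfig:
    # Mirror the logic above: only override the FA version when the
    # backend actually pins one; the boolean flags are always set.
    if overrides["flash_attn_version"] is not None:
        cfg.flash_attn_version = overrides["flash_attn_version"]
    cfg.disable_flashinfer_prefill = overrides["disable_flashinfer_prefill"]
    cfg.use_cudnn_prefill = overrides["use_cudnn_prefill"]
    cfg.use_trtllm_ragged_deepseek_prefill = overrides[
        "use_trtllm_ragged_deepseek_prefill"
    ]
    return cfg


fa3 = {
    "flash_attn_version": 3,
    "disable_flashinfer_prefill": True,
    "use_cudnn_prefill": False,
    "use_trtllm_ragged_deepseek_prefill": False,
}
print(apply_prefill_backend(AttnConfig(), fa3).flash_attn_version)  # -> 3
```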
# ============================================================================
# Decode Backend Configuration
# ============================================================================
@@ -203,6 +282,7 @@ def _get_backend_config(backend: str) -> dict:
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backend import MultipleOf
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
@@ -219,8 +299,8 @@ def _get_backend_config(backend: str) -> dict:
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
if isinstance(block_size, MultipleOf):
# No fixed block size; fall back to config value
block_size = None
# Check if sparse via class method if available
@@ -676,16 +756,11 @@ def _run_single_benchmark(
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
forward_fn = lambda: impl.forward_mha(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
@@ -732,6 +807,7 @@ def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
@@ -743,11 +819,13 @@ def _run_mla_benchmark_batched(
to avoid setup/teardown overhead.
Args:
backend: Backend name
backend: Backend name (decode backend used for impl construction)
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
List of BenchmarkResult objects
@@ -757,7 +835,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
@@ -780,11 +858,25 @@ def _run_mla_benchmark_batched(
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
)
results = []
with set_current_vllm_config(vllm_config):
# Clear cached prefill backend detection functions so they re-evaluate
# with the current VllmConfig. These are @functools.cache decorated and
# would otherwise return stale results from a previous backend's config.
from vllm.model_executor.layers.attention.mla_attention import (
use_cudnn_prefill,
use_flashinfer_prefill,
use_trtllm_ragged_deepseek_prefill,
)
use_flashinfer_prefill.cache_clear()
use_cudnn_prefill.cache_clear()
use_trtllm_ragged_deepseek_prefill.cache_clear()
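The staleness being guarded against here is generic `functools.cache` behavior; a toy reproduction (names are illustrative, not vLLM's actual functions):
```python
# A @functools.cache-decorated predicate latches the first config it sees,
# which is exactly why the cache_clear() calls above are needed.
import functools

CONFIG = {"use_cudnn_prefill": False}


@functools.cache
def use_cudnn() -> bool:
    return CONFIG["use_cudnn_prefill"]


print(use_cudnn())                  # False, and now cached
CONFIG["use_cudnn_prefill"] = True
print(use_cudnn())                  # still False: stale cached result
use_cudnn.cache_clear()
print(use_cudnn())                  # True, re-evaluated against new config
```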
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
@@ -794,6 +886,38 @@ def _run_mla_benchmark_batched(
index_topk=index_topk if is_sparse else None,
)
# Verify the actual prefill backend matches what was requested
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
fa_version = prefill_cfg["flash_attn_version"]
if fa_version is not None:
# FA backend: verify the impl's FA version
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
f"version {fa_version}, but the impl is using FA "
f"version {actual_fa_version}. Check "
f"vllm/v1/attention/backends/fa_utils.py."
)
else:
# Non-FA backend: verify the builder picked the right path
expected_flags = {
"flashinfer": "_use_fi_prefill",
"cudnn": "_use_cudnn_prefill",
"trtllm": "_use_trtllm_ragged_prefill",
}
flag_name = expected_flags.get(prefill_backend)
if flag_name and not getattr(builder_instance, flag_name, False):
raise RuntimeError(
f"Prefill backend '{prefill_backend}' was requested "
f"but the metadata builder did not enable it. This "
f"usually means a dependency is missing (e.g., "
f"flashinfer not installed) or the platform doesn't "
f"support it."
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -844,6 +968,7 @@ def run_mla_benchmark(
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
@@ -861,6 +986,8 @@ def run_mla_benchmark(
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -884,7 +1011,9 @@ def run_mla_benchmark(
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
results = _run_mla_benchmark_batched(
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
)
# Return single result or list based on input
return results[0] if return_single else results


@@ -418,8 +418,8 @@ def _run_single_benchmark(
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
"allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
"reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
}
return times, mem_stats
@@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
backend_cfg = _get_backend_config(config.backend)


@@ -95,13 +95,16 @@ def create_logits(
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.accelerator.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
return (
torch.accelerator.memory_allocated(),
torch.accelerator.max_memory_allocated(),
)
def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.accelerator.reset_peak_memory_stats()
torch.accelerator.empty_cache()
gc.collect()
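Taken together, these call sites converge on a device-agnostic measurement pattern along these lines (a sketch assuming a PyTorch build that exposes the `torch.accelerator` memory APIs used above):
```python
# Sketch of the device-agnostic memory-measurement pattern above.
import gc

import torch


def peak_memory_of(fn) -> int:
    """Run fn() and return peak allocated bytes on the current accelerator."""
    gc.collect()
    torch.accelerator.empty_cache()
    torch.accelerator.reset_peak_memory_stats()
    fn()
    torch.accelerator.synchronize()
    return torch.accelerator.max_memory_allocated()


if torch.accelerator.is_available():
    dev = torch.accelerator.current_accelerator()
    print(peak_memory_of(lambda: torch.randn(1024, 1024, device=dev)))
```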


@@ -64,7 +64,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
init_workspace_manager(torch.accelerator.current_device_index())
(m, k, n) = mkn
dtype = torch.half


@@ -495,7 +495,7 @@ def main():
# Set device
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Get CPU process group
cpu_group = dist.new_group(backend="gloo")


@@ -392,7 +392,7 @@ def benchmark_operation(
num_op_per_cudagraph = 10
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
device = torch.device(f"cuda:{torch.cuda.current_device()}")
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
with graph_capture(device=device), torch.cuda.graph(graph):
for _ in range(num_op_per_cudagraph):
operation_func(*args, **kwargs)
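For context, the capture/replay mechanism being wrapped here is plain `torch.cuda.graph`; a minimal sketch without vLLM's `graph_capture` helper:
```python
# Plain CUDA-graph capture/replay, the pattern the benchmark wraps above.
# Real code typically warms the ops up on a side stream before capture.
import torch

assert torch.cuda.is_available()
x = torch.zeros(8, device="cuda")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):          # capture: kernels are recorded, not run
    y = x * 2 + 1

x.copy_(torch.arange(8.0, device="cuda"))
g.replay()                         # re-runs the captured kernels on new x
print(y)                           # reflects the updated input
```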
@@ -984,7 +984,7 @@ def main():
world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
init_distributed_environment()


@@ -50,7 +50,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
init_workspace_manager(torch.accelerator.current_device_index())
label = "Quant Matmul"
sub_label = (


@@ -626,7 +626,11 @@ class BenchmarkWorker:
if visible_device != f"{self.device_id}":
need_device_guard = True
with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
with (
torch.accelerator.device_index(self.device_id)
if need_device_guard
else nullcontext()
):
for idx, config in enumerate(tqdm(search_space)):
try:
kernel_time = benchmark_config(


@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.cuda.set_device(gpu_id)
torch.accelerator.set_device_index(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
def main(args):
print(args)
num_gpus = torch.cuda.device_count()
num_gpus = torch.accelerator.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")


@@ -79,7 +79,8 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support
find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_ARM_BF16)
@@ -101,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
"-mavx512f"
"-mavx512vl"
"-mavx512bw"
"-mavx512dq"
"-mavx512bf16"
"-mavx512vnni"
"-mavx512dq")
list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
${CXX_COMPILE_FLAGS_AVX512}
"-mamx-bf16"
"-mamx-tile")
"-mamx-tile"
"-mavx512bf16"
"-mavx512vnni")
list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
@@ -142,11 +145,19 @@ elseif (S390_FOUND)
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FAIL_ERROR "Can't support rvv now.")
message(STATUS "RISC-V detected")
if(RVV_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
add_compile_definitions(RISCV_BF16_SUPPORT)
elseif (RVV_FP16_FOUND)
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d)
else()
message(STATUS "compile riscv with scalar")
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
else()
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
@@ -305,7 +316,8 @@ endif()
# TODO: Refactor this
if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -357,13 +369,15 @@ if(USE_ONEDNN)
endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_AVX512
set(VLLM_EXT_SRC_SGL
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
@@ -389,31 +403,48 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
set(_C_AVX2_LIBS numa)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
LIBRARIES ${_C_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
USE_SABI 3
WITH_SOABI
)
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
# AVX512F
define_extension_target(
_C_AVX512
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${LIBS}
LIBRARIES ${_C_AVX512_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
# AVX2
define_extension_target(
_C_AVX2
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${LIBS}
LIBRARIES ${_C_AVX2_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3
WITH_SOABI


@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn


@@ -919,8 +919,8 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, ENTRY_SZ, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
@@ -931,6 +931,12 @@ __global__ void gather_and_maybe_dequant_cache(
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576)
#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \
CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320)
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
@@ -960,9 +966,10 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(
head_dim == 320 || head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@@ -987,7 +994,13 @@ void gather_and_maybe_dequant_cache(
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
if (head_dim == 576) {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_576);
} else {
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype,
CALL_GATHER_CACHE_320);
}
}
namespace vllm {


@@ -13,6 +13,9 @@
#elif defined(__aarch64__)
// arm implementation
#include "cpu_types_arm.hpp"
#elif defined(__riscv_v)
// riscv implementation
#include "cpu_types_riscv.hpp"
#else
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
#include "cpu_types_scalar.hpp"

csrc/cpu/cpu_types_riscv.hpp (new file)

@@ -0,0 +1,832 @@
#ifndef CPU_TYPES_RISCV_HPP
#define CPU_TYPES_RISCV_HPP
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <limits>
#include <riscv_vector.h>
#include <torch/all.h>
// ============================================================================
// Vector Register Type Definitions (VLEN=128 bits)
// ============================================================================
typedef vfloat16m1_t fixed_vfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat16m2_t fixed_vfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m1_t fixed_vfloat32m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vfloat32m2_t fixed_vfloat32m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vfloat32m4_t fixed_vfloat32m4_t
__attribute__((riscv_rvv_vector_bits(512)));
typedef vfloat32m8_t fixed_vfloat32m8_t
__attribute__((riscv_rvv_vector_bits(1024)));
typedef vint32m2_t fixed_vint32m2_t __attribute__((riscv_rvv_vector_bits(256)));
typedef vint32m4_t fixed_vint32m4_t __attribute__((riscv_rvv_vector_bits(512)));
typedef vuint16m1_t fixed_vuint16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vuint16m2_t fixed_vuint16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vuint16m4_t fixed_vuint16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#ifdef RISCV_BF16_SUPPORT
typedef vbfloat16m1_t fixed_vbfloat16m1_t
__attribute__((riscv_rvv_vector_bits(128)));
typedef vbfloat16m2_t fixed_vbfloat16m2_t
__attribute__((riscv_rvv_vector_bits(256)));
typedef vbfloat16m4_t fixed_vbfloat16m4_t
__attribute__((riscv_rvv_vector_bits(512)));
#endif
namespace vec_op {
#ifdef RISCV_BF16_SUPPORT
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#else
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#endif
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define FORCE_INLINE __attribute__((always_inline)) inline
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
};
} // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; };
};
struct FP32Vec8;
struct FP32Vec16;
// ============================================================================
// FP16 Implementation
// ============================================================================
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat16m1_t reg;
explicit FP16Vec8(const void* ptr)
: reg(__riscv_vle16_v_f16m1(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m1(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m1(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
struct FP16Vec16 : public Vec<FP16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat16m2_t reg;
explicit FP16Vec16(const void* ptr)
: reg(__riscv_vle16_v_f16m2(static_cast<const _Float16*>(ptr),
VEC_ELEM_NUM)) {};
explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_f16m2(static_cast<_Float16*>(ptr), reg, elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(_Float16);
__riscv_vsse16_v_f16m2(static_cast<_Float16*>(ptr), byte_stride, reg,
VEC_ELEM_NUM);
}
};
// ============================================================================
// BF16 Implementation
// ============================================================================
#ifdef RISCV_BF16_SUPPORT
FORCE_INLINE fixed_vuint16m1_t bf16_to_u16(fixed_vbfloat16m1_t v) {
return __riscv_vreinterpret_v_bf16m1_u16m1(v);
}
FORCE_INLINE fixed_vuint16m2_t bf16_to_u16(fixed_vbfloat16m2_t v) {
return __riscv_vreinterpret_v_bf16m2_u16m2(v);
}
FORCE_INLINE fixed_vuint16m4_t bf16_to_u16(fixed_vbfloat16m4_t v) {
return __riscv_vreinterpret_v_bf16m4_u16m4(v);
}
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vbfloat16m1_t reg;
explicit BF16Vec8(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m1_bf16m1(__riscv_vle16_v_u16m1(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec8(fixed_vbfloat16m1_t data) : reg(data) {};
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m1(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vbfloat16m2_t reg;
explicit BF16Vec16(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m2_bf16m2(__riscv_vle16_v_u16m2(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec16(fixed_vbfloat16m2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m2(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vbfloat16m4_t reg;
explicit BF16Vec32(const void* ptr)
: reg(__riscv_vreinterpret_v_u16m4_bf16m4(__riscv_vle16_v_u16m4(
reinterpret_cast<const uint16_t*>(ptr), VEC_ELEM_NUM))) {};
explicit BF16Vec32(fixed_vbfloat16m4_t data) : reg(data) {};
explicit BF16Vec32(const BF16Vec8& v) {
fixed_vuint16m1_t u16_val = bf16_to_u16(v.reg);
fixed_vuint16m4_t u16_combined =
__riscv_vcreate_v_u16m1_u16m4(u16_val, u16_val, u16_val, u16_val);
reg = __riscv_vreinterpret_v_u16m4_bf16m4(u16_combined);
};
void save(void* ptr) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
VEC_ELEM_NUM);
}
void save(void* ptr, int elem_num) const {
__riscv_vse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), bf16_to_u16(reg),
elem_num);
}
void save_strided(void* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
__riscv_vsse16_v_u16m4(reinterpret_cast<uint16_t*>(ptr), byte_stride,
bf16_to_u16(reg), VEC_ELEM_NUM);
}
};
#else
// ============================================================================
// BF16 Fallback Implementation (FP32 Simulation)
// ============================================================================
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg_fp32;
explicit BF16Vec8(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[8];
for (int i = 0; i < 8; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m2(tmp, 8);
}
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[8];
__riscv_vse32_v_f32m2(tmp, reg_fp32, 8);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 8; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg_fp32;
explicit BF16Vec16(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[16];
for (int i = 0; i < 16; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m4(tmp, 16);
}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[16];
__riscv_vse32_v_f32m4(tmp, reg_fp32, 16);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 16; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
fixed_vfloat32m8_t reg_fp32;
explicit BF16Vec32(const void* ptr) {
const uint16_t* u16 = static_cast<const uint16_t*>(ptr);
float tmp[32];
for (int i = 0; i < 32; ++i) {
uint32_t v = static_cast<uint32_t>(u16[i]) << 16;
std::memcpy(&tmp[i], &v, 4);
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp, 32);
}
explicit BF16Vec32(const BF16Vec8& v) {
float tmp_small[8];
__riscv_vse32_v_f32m2(tmp_small, v.reg_fp32, 8);
float tmp_large[32];
for (int i = 0; i < 4; ++i) {
std::memcpy(tmp_large + (i * 8), tmp_small, 8 * sizeof(float));
}
reg_fp32 = __riscv_vle32_v_f32m8(tmp_large, 32);
}
void save(void* ptr) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save(void* ptr, int elem_num) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint16_t* u16 = static_cast<uint16_t*>(ptr);
for (int i = 0; i < elem_num; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
u16[i] = static_cast<uint16_t>(v >> 16);
}
}
void save_strided(void* ptr, ptrdiff_t stride) const {
float tmp[32];
__riscv_vse32_v_f32m8(tmp, reg_fp32, 32);
uint8_t* u8 = static_cast<uint8_t*>(ptr);
ptrdiff_t byte_stride = stride * sizeof(uint16_t);
for (int i = 0; i < 32; ++i) {
uint32_t v;
std::memcpy(&v, &tmp[i], 4);
uint16_t val = static_cast<uint16_t>(v >> 16);
*reinterpret_cast<uint16_t*>(u8 + i * byte_stride) = val;
}
}
};
#endif
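The simulated path works because bfloat16 is simply the top 16 bits of an IEEE-754 float32 (truncation, no rounding); in scalar Python terms, the round trip above amounts to:
```python
# The bit trick behind the FP32-simulated BF16 path above.
import struct


def bf16_to_f32(u16: int) -> float:
    # Load: widen the 16 bf16 bits into the high half of a float32.
    return struct.unpack("<f", struct.pack("<I", u16 << 16))[0]


def f32_to_bf16(x: float) -> int:
    # Store: keep only the top 16 bits of the float32 pattern.
    return struct.unpack("<I", struct.pack("<f", x))[0] >> 16


v = 3.1415926
b = f32_to_bf16(v)
print(hex(b), bf16_to_f32(b))  # ~3.14, about 3 significant decimal digits
```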
// ============================================================================
// FP32 Implementation
// ============================================================================
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
fixed_vfloat32m1_t reg;
explicit FP32Vec4(float v) : reg(__riscv_vfmv_v_f_f32m1(v, VEC_ELEM_NUM)) {};
explicit FP32Vec4() : reg(__riscv_vfmv_v_f_f32m1(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec4(const float* ptr)
: reg(__riscv_vle32_v_f32m1(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec4(fixed_vfloat32m1_t data) : reg(data) {};
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
void save(float* ptr) const { __riscv_vse32_v_f32m1(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m1(ptr, reg, elem_num);
}
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
fixed_vfloat32m2_t reg;
explicit FP32Vec8(float v) : reg(__riscv_vfmv_v_f_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8() : reg(__riscv_vfmv_v_f_f32m2(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const float* ptr)
: reg(__riscv_vle32_v_f32m2(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat32m2_t data) : reg(data) {};
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
explicit FP32Vec8(fixed_vfloat16m1_t v)
: reg(__riscv_vfwcvt_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec8(fixed_vbfloat16m1_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v, VEC_ELEM_NUM)) {};
explicit FP32Vec8(const BF16Vec8& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m2(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec8(const BF16Vec8& v) : reg(v.reg_fp32) {};
#endif
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m2_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmul_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfadd_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfsub_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfdiv_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 max(const FP32Vec8& b) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec8 abs() const {
return FP32Vec8(__riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM));
}
FP32Vec8 min(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmin_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 max(const FP32Vec8& b, int elem_num) const {
return FP32Vec8(__riscv_vfmax_vv_f32m2(reg, b.reg, elem_num));
}
FP32Vec8 clamp(const FP32Vec8& min_v, const FP32Vec8& max_v) const {
fixed_vfloat32m2_t temp =
__riscv_vfmax_vv_f32m2(min_v.reg, reg, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfmin_vv_f32m2(max_v.reg, temp, VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m2(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m2(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m2(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
FP32Vec8 exp() const {
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m2_t x_scaled =
__riscv_vfmul_vf_f32m2(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m2_t n_int = __riscv_vfcvt_x_f_v_i32m2(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m2_t n_float = __riscv_vfcvt_f_x_v_f32m2(n_int, VEC_ELEM_NUM);
fixed_vfloat32m2_t r =
__riscv_vfsub_vv_f32m2(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly =
__riscv_vfmv_v_f_f32m2(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, r, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(poly, 1.0f, VEC_ELEM_NUM);
fixed_vint32m2_t biased_exp =
__riscv_vadd_vx_i32m2(n_int, 127, VEC_ELEM_NUM);
biased_exp = __riscv_vmax_vx_i32m2(biased_exp, 0, VEC_ELEM_NUM);
fixed_vint32m2_t exponent_bits =
__riscv_vsll_vx_i32m2(biased_exp, 23, VEC_ELEM_NUM);
fixed_vfloat32m2_t scale =
__riscv_vreinterpret_v_i32m2_f32m2(exponent_bits);
return FP32Vec8(__riscv_vfmul_vv_f32m2(poly, scale, VEC_ELEM_NUM));
}
FP32Vec8 tanh() const {
fixed_vfloat32m2_t x_clamped = __riscv_vfmin_vf_f32m2(
__riscv_vfmax_vf_f32m2(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t x2 =
__riscv_vfmul_vf_f32m2(x_clamped, 2.0f, VEC_ELEM_NUM);
FP32Vec8 exp_val = FP32Vec8(x2).exp();
fixed_vfloat32m2_t num =
__riscv_vfsub_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t den =
__riscv_vfadd_vf_f32m2(exp_val.reg, 1.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfdiv_vv_f32m2(num, den, VEC_ELEM_NUM));
}
FP32Vec8 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(reg, VEC_ELEM_NUM);
fixed_vfloat32m2_t t = __riscv_vfadd_vf_f32m2(
__riscv_vfmul_vf_f32m2(abs_x, p, VEC_ELEM_NUM), 1.0f, VEC_ELEM_NUM);
t = __riscv_vfrdiv_vf_f32m2(t, 1.0f, VEC_ELEM_NUM);
fixed_vfloat32m2_t poly = __riscv_vfmv_v_f_f32m2(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m2(__riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m2(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m2_t exp_val =
FP32Vec8(__riscv_vfneg_v_f32m2(
__riscv_vfmul_vv_f32m2(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m2_t res = __riscv_vfrsub_vf_f32m2(
__riscv_vfmul_vv_f32m2(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec8(__riscv_vfneg_v_f32m2_m(mask, res, VEC_ELEM_NUM));
}
};
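The `exp()` above is the standard `2**n * p(r)` range reduction; a scalar Python check of the same constants and steps (for verification, not performance):
```python
# Scalar rendering of FP32Vec8::exp(): e**x = 2**n * p(r), where
# n = round(x * log2(e)) and p is a degree-5 Taylor polynomial for 2**r.
import math
import struct

# Same constants as the vector code: Taylor coefficients of 2**r
# (highest degree first) and the log2(e) scale factor.
COEFFS = [0.001333355810164, 0.009618129107628, 0.055504108664821,
          0.240226506959101, 0.693147180559945, 1.0]
INV_LN2 = 1.44269504088896341


def exp_approx(x: float) -> float:
    x_scaled = x * INV_LN2
    n = int(round(x_scaled))     # round-to-nearest, like vfcvt_x_f_v
    r = x_scaled - n             # r in [-0.5, 0.5]
    poly = COEFFS[0]
    for c in COEFFS[1:]:         # Horner evaluation of p(r) ~= 2**r
        poly = poly * r + c
    biased = max(n + 127, 0)     # clamp underflow only, as the code does
    # Build 2**n by placing the biased exponent into the float32 bits
    # (the vsll / vreinterpret step in the vector code).
    scale = struct.unpack("<f", struct.pack("<I", biased << 23))[0]
    return poly * scale


for x in (-3.0, 0.0, 1.0, 4.2):
    print(f"{x:+.1f}: approx={exp_approx(x):.6f} exact={math.exp(x):.6f}")
```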
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
fixed_vfloat32m4_t reg;
explicit FP32Vec16(float v) : reg(__riscv_vfmv_v_f_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16() : reg(__riscv_vfmv_v_f_f32m4(0.0f, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const float* ptr)
: reg(__riscv_vle32_v_f32m4(ptr, VEC_ELEM_NUM)) {};
explicit FP32Vec16(fixed_vfloat32m4_t data) : reg(data) {};
explicit FP32Vec16(const FP32Vec8& data)
: reg(__riscv_vcreate_v_f32m2_f32m4(data.reg, data.reg)) {};
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v);
#ifdef RISCV_BF16_SUPPORT
explicit FP32Vec16(fixed_vbfloat16m2_t v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v, VEC_ELEM_NUM)) {};
explicit FP32Vec16(const BF16Vec16& v)
: reg(__riscv_vfwcvtbf16_f_f_v_f32m4(v.reg, VEC_ELEM_NUM)) {};
#else
explicit FP32Vec16(const BF16Vec16& v) : reg(v.reg_fp32) {};
#endif
FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfadd_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfsub_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmul_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfdiv_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 fma(const FP32Vec16& a, const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmacc_vv_f32m4(reg, a.reg, b.reg, VEC_ELEM_NUM));
}
float reduce_sum() const {
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar = __riscv_vfredusum_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_max() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::lowest(), 1);
scalar = __riscv_vfredmax_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
float reduce_min() const {
fixed_vfloat32m1_t scalar =
__riscv_vfmv_s_f_f32m1(std::numeric_limits<float>::max(), 1);
scalar = __riscv_vfredmin_vs_f32m4_f32m1(reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
const int start = idx * group_size;
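// Build a lane mask selecting [start, start + group_size) and perform a
// masked reduction so only that sub-group contributes to the sum.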
vuint32m4_t indices = __riscv_vid_v_u32m4(VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmand_mm_b8(
__riscv_vmsgeu_vx_u32m4_b8(indices, start, VEC_ELEM_NUM),
__riscv_vmsltu_vx_u32m4_b8(indices, start + group_size, VEC_ELEM_NUM),
VEC_ELEM_NUM);
fixed_vfloat32m1_t scalar = __riscv_vfmv_s_f_f32m1(0.0f, 1);
scalar =
__riscv_vfredusum_vs_f32m4_f32m1_m(mask, reg, scalar, VEC_ELEM_NUM);
return __riscv_vfmv_f_s_f32m1_f32(scalar);
}
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmax_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 min(const FP32Vec16& b) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(reg, b.reg, VEC_ELEM_NUM));
}
FP32Vec16 abs() const {
return FP32Vec16(__riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM));
}
FP32Vec16 clamp(const FP32Vec16& min_v, const FP32Vec16& max_v) const {
return FP32Vec16(__riscv_vfmin_vv_f32m4(
max_v.reg, __riscv_vfmax_vv_f32m4(min_v.reg, reg, VEC_ELEM_NUM),
VEC_ELEM_NUM));
}
void save(float* ptr) const { __riscv_vse32_v_f32m4(ptr, reg, VEC_ELEM_NUM); }
void save(float* ptr, int elem_num) const {
__riscv_vse32_v_f32m4(ptr, reg, elem_num);
}
void save_strided(float* ptr, ptrdiff_t stride) const {
ptrdiff_t byte_stride = stride * sizeof(float);
__riscv_vsse32_v_f32m4(ptr, byte_stride, reg, VEC_ELEM_NUM);
}
FP32Vec16 exp() const {
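// Range reduction: x * log2(e) = n + r with n = round(x * log2(e)), so
// e^x = 2^n * 2^r; 2^r is approximated by a degree-5 polynomial in r.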
const float inv_ln2 = 1.44269504088896341f;
fixed_vfloat32m4_t x_scaled =
__riscv_vfmul_vf_f32m4(reg, inv_ln2, VEC_ELEM_NUM);
fixed_vint32m4_t n_int = __riscv_vfcvt_x_f_v_i32m4(x_scaled, VEC_ELEM_NUM);
fixed_vfloat32m4_t n_float = __riscv_vfcvt_f_x_v_f32m4(n_int, VEC_ELEM_NUM);
fixed_vfloat32m4_t r =
__riscv_vfsub_vv_f32m4(x_scaled, n_float, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly =
__riscv_vfmv_v_f_f32m4(0.001333355810164f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.009618129107628f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.055504108664821f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.240226506959101f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
0.693147180559945f, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, r, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vint32m4_t biased_exp = __riscv_vmax_vx_i32m4(
__riscv_vadd_vx_i32m4(n_int, 127, VEC_ELEM_NUM), 0, VEC_ELEM_NUM);
fixed_vfloat32m4_t scale = __riscv_vreinterpret_v_i32m4_f32m4(
__riscv_vsll_vx_i32m4(biased_exp, 23, VEC_ELEM_NUM));
return FP32Vec16(__riscv_vfmul_vv_f32m4(poly, scale, VEC_ELEM_NUM));
}
FP32Vec16 tanh() const {
fixed_vfloat32m4_t x_clamped = __riscv_vfmin_vf_f32m4(
__riscv_vfmax_vf_f32m4(reg, -9.0f, VEC_ELEM_NUM), 9.0f, VEC_ELEM_NUM);
FP32Vec16 exp_val =
FP32Vec16(__riscv_vfmul_vf_f32m4(x_clamped, 2.0f, VEC_ELEM_NUM)).exp();
return FP32Vec16(__riscv_vfdiv_vv_f32m4(
__riscv_vfsub_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM),
__riscv_vfadd_vf_f32m4(exp_val.reg, 1.0f, VEC_ELEM_NUM), VEC_ELEM_NUM));
}
FP32Vec16 er() const {
const float p = 0.3275911f, a1 = 0.254829592f, a2 = -0.284496736f,
a3 = 1.421413741f, a4 = -1.453152027f, a5 = 1.061405429f;
fixed_vfloat32m4_t abs_x = __riscv_vfabs_v_f32m4(reg, VEC_ELEM_NUM);
fixed_vfloat32m4_t t = __riscv_vfrdiv_vf_f32m4(
__riscv_vfadd_vf_f32m4(__riscv_vfmul_vf_f32m4(abs_x, p, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM),
1.0f, VEC_ELEM_NUM);
fixed_vfloat32m4_t poly = __riscv_vfmv_v_f_f32m4(a5, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a4, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a3, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a2, VEC_ELEM_NUM);
poly = __riscv_vfadd_vf_f32m4(__riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM),
a1, VEC_ELEM_NUM);
poly = __riscv_vfmul_vv_f32m4(poly, t, VEC_ELEM_NUM);
fixed_vfloat32m4_t exp_val =
FP32Vec16(__riscv_vfneg_v_f32m4(
__riscv_vfmul_vv_f32m4(abs_x, abs_x, VEC_ELEM_NUM),
VEC_ELEM_NUM))
.exp()
.reg;
fixed_vfloat32m4_t res = __riscv_vfrsub_vf_f32m4(
__riscv_vfmul_vv_f32m4(poly, exp_val, VEC_ELEM_NUM), 1.0f,
VEC_ELEM_NUM);
vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(reg, 0.0f, VEC_ELEM_NUM);
return FP32Vec16(__riscv_vfneg_v_f32m4_m(mask, res, VEC_ELEM_NUM));
}
};
// ============================================================================
// Type Traits & Global Helpers
// ============================================================================
template <typename T>
struct VecType {
using vec_type = void;
using vec_t = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
using vec_t = FP32Vec8;
};
template <>
struct VecType<c10::Half> {
using vec_type = FP16Vec8;
using vec_t = FP16Vec8;
};
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
using vec_t = BF16Vec8;
};
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
*reinterpret_cast<_Float16*>(ptr) = static_cast<_Float16>(v);
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
reg = __riscv_vfncvt_f_f_w_f16m2(v.reg, VEC_ELEM_NUM);
}
inline FP16Vec8::FP16Vec8(const FP32Vec8& v) {
reg = __riscv_vfncvt_f_f_w_f16m1(v.reg, VEC_ELEM_NUM);
}
inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
reg = __riscv_vfwcvt_f_f_v_f32m4(v.reg, VEC_ELEM_NUM);
}
inline void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc = acc.fma(a, b);
}
#ifdef RISCV_BF16_SUPPORT
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
*ptr = static_cast<__bf16>(v);
};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m1(v.reg, VEC_ELEM_NUM)) {};
inline BF16Vec16::BF16Vec16(const FP32Vec16& v)
: reg(__riscv_vfncvtbf16_f_f_w_bf16m2(v.reg, VEC_ELEM_NUM)) {};
#else
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
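// Without native BF16 support, emulate the store by truncating the FP32
// bit pattern to its upper 16 bits (no rounding), matching the reg_fp32
// fallback used by the vector types below.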
uint32_t val;
std::memcpy(&val, &v, 4);
*reinterpret_cast<uint16_t*>(ptr) = static_cast<uint16_t>(val >> 16);
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg_fp32(v.reg) {}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg_fp32(v.reg) {}
#endif
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }
} // namespace vec_op
#ifndef CPU_KERNEL_GUARD_IN
#define CPU_KERNEL_GUARD_IN(NAME)
#endif
#ifndef CPU_KERNEL_GUARD_OUT
#define CPU_KERNEL_GUARD_OUT(NAME)
#endif
#endif // CPU_TYPES_RISCV_HPP
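For orientation, here is a minimal sketch of how a CPU kernel might drive this header's API — hypothetical function and buffer names, assuming the header is installed as `cpu_types_riscv.hpp`:

```cpp
// Hypothetical usage sketch (not part of the header): apply exp() to a float
// buffer in 16-wide chunks, finishing with a partial-length store for the tail.
#include <cstring>
#include "cpu_types_riscv.hpp"

void exp_inplace(float* data, int n) {
  using vec_op::FP32Vec16;
  int i = 0;
  for (; i + FP32Vec16::VEC_ELEM_NUM <= n; i += FP32Vec16::VEC_ELEM_NUM) {
    FP32Vec16 v(data + i);   // full-width vle32 load
    v.exp().save(data + i);  // full-width vse32 store
  }
  if (i < n) {
    // Stage the tail through a zero-padded buffer to avoid reading past n.
    float tmp[FP32Vec16::VEC_ELEM_NUM] = {};
    std::memcpy(tmp, data + i, (n - i) * sizeof(float));
    FP32Vec16 v(tmp);
    v.exp().save(data + i, n - i);  // partial-length store of the tail
  }
}
```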

View File

@@ -196,6 +196,7 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
return val;
#else
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
return u32x8_t{};
#endif
}
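The added `return u32x8_t{};` matters because `assert` compiles to nothing under `NDEBUG`, which would otherwise leave this value-returning function without a return statement on the fallback path — undefined behavior, and a compiler warning.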

View File

@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
#ifndef USE_ROCM
int flag = 0;
CUDA_CHECK(cuDeviceGetAttribute(
CUresult rdma_result = cuDeviceGetAttribute(
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
device));
if (flag) { // support GPUDirect RDMA if possible
device);
if (rdma_result == CUDA_SUCCESS &&
flag) { // support GPUDirect RDMA if possible
prop.allocFlags.gpuDirectRDMACapable = 1;
}
int fab_flag = 0;
CUDA_CHECK(cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
if (fab_flag) { // support fabric handle if possible
CUresult fab_result = cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
if (fab_result == CUDA_SUCCESS &&
fab_flag) { // support fabric handle if possible
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
}
#endif
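The pattern adopted above — inspecting the `CUresult` instead of wrapping the call in `CUDA_CHECK` — could be factored into a small helper. A sketch only; `device_attr_enabled` is a hypothetical name, not code from this diff:

```cpp
#include <cuda.h>

// Query an optional device attribute tolerantly: an error from an older
// driver (rather than aborting via CUDA_CHECK) simply reports the feature
// as unavailable.
static bool device_attr_enabled(CUdevice dev, CUdevice_attribute attr) {
  int flag = 0;
  CUresult res = cuDeviceGetAttribute(&flag, attr, dev);
  return res == CUDA_SUCCESS && flag != 0;
}
```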

View File

@@ -73,10 +73,9 @@ void moe_permute(
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, stream);
get_ptr<int>(sorted_row_idx), get_ptr<int>(inv_permuted_idx),
get_ptr<int>(permuted_idx), get_ptr<int64_t>(expert_first_token_offset),
n_token, valid_num_ptr, n_hidden, topk, n_local_expert, stream);
});
}
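Note: this call-site change pairs with the launcher and kernel hunks below — the `sorted_experts` pointer (fed from `permuted_experts_id` here) was only ever read into an unused `expert_id` local inside the kernel, so it is dropped end-to-end.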

View File

@@ -57,7 +57,7 @@ void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
T const* unpermuted_input, T* permuted_output,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,

View File

@@ -2,7 +2,7 @@
template <typename T, bool CHECK_SKIPPED>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
T const* unpermuted_input, T* permuted_output,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -16,7 +16,6 @@ __global__ void expandInputRowsKernel(
int64_t expanded_dest_row = blockIdx.x;
int64_t const expanded_source_row =
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
if (threadIdx.x == 0) {
assert(expanded_dest_row <= INT32_MAX);
@@ -54,7 +53,7 @@ __global__ void expandInputRowsKernel(
template <typename T>
void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
T const* unpermuted_input, T* permuted_output,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t const num_rows,
@@ -70,12 +69,12 @@ void expandInputRowsKernelLauncher(
bool is_check_skip = num_valid_tokens_ptr != nullptr;
auto func = func_map[is_check_skip];
func<<<blocks, threads, 0, stream>>>(
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
func<<<blocks, threads, 0, stream>>>(unpermuted_input, permuted_output,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row,
permuted_idx, expert_first_token_offset,
num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
}
template <class T, class U>

View File

@@ -295,10 +295,14 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a,
std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_scale,
bool is_sf_swizzled_layout);
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_scale,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_scale);
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -16,6 +16,8 @@
#include <torch/all.h>
#include "nvfp4_utils.cuh"
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@@ -51,9 +53,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
torch::Tensor const& output_scale_offset_by_experts);
#endif
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_sf, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
void scaled_fp4_quant_out(torch::Tensor const& input,
torch::Tensor const& input_sf,
bool is_sf_swizzled_layout, torch::Tensor& output,
torch::Tensor& output_sf) {
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
@@ -62,6 +65,34 @@ void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}
std::tuple<torch::Tensor, torch::Tensor> scaled_fp4_quant_func(
torch::Tensor const& input, torch::Tensor const& input_sf,
bool is_sf_swizzled_layout) {
int64_t n = input.size(-1);
int64_t m = input.numel() / n;
auto device = input.device();
// Two fp4 values packed into a uint8
auto output = torch::empty(
{m, n / 2}, torch::TensorOptions().device(device).dtype(torch::kUInt8));
torch::Tensor output_sf;
if (is_sf_swizzled_layout) {
auto [sf_m, sf_n] = vllm::computeSwizzledSFShape(m, n);
output_sf = torch::empty(
{sf_m, sf_n},
torch::TensorOptions().device(device).dtype(torch::kInt32));
} else {
output_sf = torch::empty(
{m, n / CVT_FP4_SF_VEC_SIZE},
torch::TensorOptions().device(device).dtype(torch::kUInt8));
}
scaled_fp4_quant_out(input, input_sf, is_sf_swizzled_layout, output,
output_sf);
return {output, output_sf};
}
void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,

View File

@@ -18,6 +18,7 @@
#include <cuda_runtime.h>
#include <cuda_fp8.h>
#include <utility>
#include "../../cuda_vec_utils.cuh"
@@ -54,6 +55,18 @@ inline int computeEffectiveRows(int m) {
return round_up(m, ROW_TILE);
}
// Compute the shape of the swizzled SF output tensor.
// Returns (rounded_m, rounded_n / 4) where:
// rounded_m = round_up(m, 128)
// rounded_n = round_up(n / CVT_FP4_SF_VEC_SIZE, 4)
inline std::pair<int64_t, int64_t> computeSwizzledSFShape(int64_t m,
int64_t n) {
int64_t rounded_m = round_up(m, static_cast<int64_t>(128));
int64_t scale_n = n / CVT_FP4_SF_VEC_SIZE;
int64_t rounded_n = round_up(scale_n, static_cast<int64_t>(4));
return {rounded_m, rounded_n / 4};
}
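// Worked example, assuming CVT_FP4_SF_VEC_SIZE == 16: m = 30, n = 2048 gives
// scale_n = 128, rounded_n = 128, rounded_m = 128 -> returns {128, 32}.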
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
uint32_t val;

View File

@@ -15,31 +15,33 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) {
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
float rms = 0.0f;
float token_scale = 0.0f;
// Compute rms
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, var_epsilon, residual);
&rms, input, hidden_size, input_stride, var_epsilon, residual);
// Compute scale
vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
residual);
input_stride, residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, true,
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
} else {
// FP8 - Do not invert token_scale for exact match with FBGemm
vllm::vectorized::norm_and_quant<scalar_t, scalar_out_t, false,
has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
has_residual>(out, input, weight, rms,
&token_scale, hidden_size,
input_stride, residual);
}
}
@@ -51,38 +53,40 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr) {
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr) {
// For vectorization, token_input and token_output pointers need to be
// aligned at 8-byte and 4-byte addresses respectively.
bool const can_vectorize = hidden_size % 4 == 0;
bool const can_vectorize = hidden_size % 4 == 0 and input_stride % 4 == 0;
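// Each row begins at input + blockIdx.x * input_stride, so the stride must
// also be a multiple of 4 to keep every row aligned for vec4_t accesses.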
if (can_vectorize) {
return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
has_residual>(
out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
residual);
input_stride, residual);
}
float rms = 0.0f;
float token_scale = 0.0f;
// Compute RMS
vllm::compute_rms<scalar_t, has_residual>(&rms, input, hidden_size,
var_epsilon, residual);
vllm::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, input_stride, var_epsilon, residual);
// Compute Scale
vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
&token_scale, scales, input, weight, rms, scale_ub, hidden_size,
residual);
input_stride, residual);
// RMS Norm + Quant
if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
token_scale = 1.0f / token_scale;
vllm::norm_and_quant<scalar_t, scalar_out_t, true, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
} else {
// FP8 - Do not invert s_token_scale for exact match with FBGemm
vllm::norm_and_quant<scalar_t, scalar_out_t, false, has_residual>(
out, input, weight, rms, &token_scale, hidden_size, residual);
out, input, weight, rms, &token_scale, hidden_size, input_stride,
residual);
}
}
@@ -97,19 +101,20 @@ __global__ void rms_norm_per_block_quant_kernel(
scalar_t const* __restrict__ input, // [..., hidden_size]
scalar_t const* __restrict__ weight, // [hidden_size]
float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
int32_t const input_stride, scalar_t* __restrict__ residual = nullptr,
int64_t outer_scale_stride = 1) {
float rms;
// Compute RMS
// Always able to vectorize due to constraints on hidden_size
vllm::vectorized::compute_rms<scalar_t, has_residual>(
&rms, input, hidden_size, var_epsilon, residual);
&rms, input, hidden_size, input_stride, var_epsilon, residual);
// Compute Scale
// Always able to vectorize due to constraints on hidden_size and group_size
vllm::vectorized::compute_dynamic_per_token_scales<
scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>(
nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual,
outer_scale_stride);
nullptr, scales, input, weight, rms, scale_ub, hidden_size, input_stride,
residual, outer_scale_stride);
// RMS Norm + Quant
// Always able to vectorize due to constraints on hidden_size
@@ -120,7 +125,7 @@ __global__ void rms_norm_per_block_quant_kernel(
vllm::vectorized::norm_and_quant<
scalar_t, scalar_out_t, std::is_same_v<scalar_out_t, int8_t>,
has_residual, is_scale_transposed, group_size>(
out, input, weight, rms, scales, hidden_size, residual,
out, input, weight, rms, scales, hidden_size, input_stride, residual,
outer_scale_stride);
}
@@ -137,6 +142,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -153,7 +159,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
var_epsilon, hidden_size,
var_epsilon, hidden_size, input_stride,
has_residual ? residual->data_ptr<scalar_in_t>() : nullptr);
});
});
@@ -170,7 +176,9 @@ void rms_norm_dynamic_per_token_quant(
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -179,6 +187,7 @@ void rms_norm_dynamic_per_token_quant(
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
VLLM_DISPATCH_FLOATING_TYPES(
@@ -200,6 +209,15 @@ void rms_norm_per_block_quant_dispatch(
std::optional<at::Tensor> const& scale_ub,
std::optional<at::Tensor>& residual, bool is_scale_transposed) {
int32_t hidden_size = input.size(-1);
int32_t input_stride = input.view({-1, hidden_size}).stride(0);
TORCH_CHECK(hidden_size % 4 == 0,
"Hidden size must be divisible by 4 for vectorized access");
TORCH_CHECK(input_stride % 4 == 0,
"Input stride must be divisible by 4 for vectorized access");
TORCH_CHECK(group_size % 4 == 0,
"Group size must be divisible by 4 for vectorized access");
auto num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
@@ -225,7 +243,7 @@ void rms_norm_per_block_quant_dispatch(
weight.data_ptr<scalar_in_t>(),
scale_ub.has_value() ? scale_ub->data_ptr<float>()
: nullptr,
var_epsilon, hidden_size,
var_epsilon, hidden_size, input_stride,
has_residual ? residual->data_ptr<scalar_in_t>()
: nullptr,
scales.stride(1));
@@ -246,7 +264,9 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8);
TORCH_CHECK(out.is_contiguous() && input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.stride(-1) == 1,
"Input must be contiguous in the last dimension");
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
@@ -255,6 +275,7 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
TORCH_CHECK(residual->is_contiguous());
}
TORCH_CHECK(group_size == 128 || group_size == 64,

View File

@@ -16,14 +16,17 @@ namespace vllm {
// has_residual must be true, if residual is not a nullptr
template <typename scalar_t, bool has_residual = false>
__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
int32_t const hidden_size, float const epsilon,
int32_t const hidden_size,
int32_t const input_stride, float const epsilon,
scalar_t const* __restrict__ residual = nullptr) {
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
// sum of squares
float ss = 0.0f;
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
float x = static_cast<float>(input[input_token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
}
@@ -73,15 +76,20 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
int32_t const hidden_size, int32_t const input_stride,
scalar_t const* __restrict__ residual = nullptr,
int32_t const group_size = 0, int64_t outer_scale_stride = 1) {
float block_absmax_val_maybe = 0.0f;
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
__syncthreads();
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
if (group_size > 0) {
__shared__ float s_max_vals[1024];
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
int64_t num_groups = hidden_size / group_size;
__shared__ float s_max_vals[1024];
int64_t const threads_per_group = blockDim.x / num_groups;
int64_t const thread_in_group = threadIdx.x % threads_per_group;
int64_t const group_offset = threadIdx.x / threads_per_group * group_size;
@@ -89,7 +97,7 @@ __device__ void compute_dynamic_per_token_scales(
int64_t const thread_end =
min(group_offset + group_size, static_cast<int64_t>(hidden_size));
for (auto i = thread_offset; i < thread_end; i += threads_per_group) {
float x = static_cast<float>(input[token_offset + i]);
float x = static_cast<float>(input[input_token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
}
@@ -144,10 +152,8 @@ __device__ void compute_dynamic_per_token_scales(
}
__syncthreads();
} else {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
float x = static_cast<float>(input[input_token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
}
@@ -185,12 +191,15 @@ template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
__device__ void norm_and_quant(
scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input,
scalar_t const* __restrict__ weight, float const rms, float* const scale,
int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr,
int32_t const group_size = 0, int64_t outer_scale_stride = 1) {
int32_t const hidden_size, int32_t const input_stride,
scalar_t* __restrict__ residual = nullptr, int32_t const group_size = 0,
int64_t outer_scale_stride = 1) {
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
float x = static_cast<float>(input[token_offset + i]);
float x = static_cast<float>(input[input_token_offset + i]);
if constexpr (has_residual) {
x += static_cast<float>(residual[token_offset + i]);
residual[token_offset + i] = static_cast<scalar_t>(x);
@@ -224,13 +233,16 @@ namespace vectorized {
// hidden_size must be a multiple of 4
template <typename scalar_t, bool has_residual = false>
__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
int32_t const hidden_size, float const epsilon,
int32_t const hidden_size,
int32_t const input_stride, float const epsilon,
scalar_t const* __restrict__ residual = nullptr) {
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
// Vectorized input/output to better utilize memory bandwidth.
vec4_t<scalar_t> const* vec_input =
reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
vec4_t<scalar_t> const* vec_residual = nullptr;
if constexpr (has_residual) {
vec_residual =
@@ -288,7 +300,8 @@ __device__ void compute_dynamic_per_token_scales(
float* __restrict__ token_scale, float* __restrict__ all_token_scales,
scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
float const rms, float const* __restrict__ scale_ub,
int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr,
int32_t const hidden_size, int32_t const input_stride,
scalar_t const* __restrict__ residual = nullptr,
int64_t outer_scale_stride = 1) {
constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
@@ -300,10 +313,13 @@ __device__ void compute_dynamic_per_token_scales(
vec4_t<scalar_t> const* vec_weight = nullptr;
vec4_t<scalar_t> const* vec_residual = nullptr;
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
if constexpr (group_size > 0) {
__shared__ float s_max_vals[1024];
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
int64_t const num_groups = hidden_size / group_size;
int64_t const threads_per_group = blockDim.x / num_groups;
int64_t const thread_in_group = threadIdx.x % threads_per_group;
@@ -312,7 +328,8 @@ __device__ void compute_dynamic_per_token_scales(
int64_t const thread_offset = group_offset + thread_in_group;
int64_t const thread_end = min(group_offset + (group_size >> 2),
static_cast<int64_t>(hidden_size >> 2));
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
vec_input =
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
if constexpr (has_residual) {
vec_residual =
@@ -396,8 +413,8 @@ __device__ void compute_dynamic_per_token_scales(
__syncthreads();
} else {
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
vec_input = reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
vec_input =
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
vec_weight = reinterpret_cast<vec4_t<scalar_t> const*>(weight);
if constexpr (has_residual) {
vec_residual =
@@ -462,18 +479,18 @@ __device__ void compute_dynamic_per_token_scales(
template <typename scalar_t, typename scalar_out_t, bool is_scale_inverted,
bool has_residual = false, bool is_scale_transposed = false,
int32_t group_size = 0>
__device__ void norm_and_quant(scalar_out_t* __restrict__ output,
scalar_t const* __restrict__ input,
scalar_t const* __restrict__ weight,
float const rms, float* const scale,
int32_t const hidden_size,
scalar_t* __restrict__ residual = nullptr,
int64_t outer_scale_stride = 1) {
__device__ void norm_and_quant(
scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input,
scalar_t const* __restrict__ weight, float const rms, float* const scale,
int32_t const hidden_size, int32_t const input_stride,
scalar_t* __restrict__ residual = nullptr, int64_t outer_scale_stride = 1) {
int64_t const input_token_offset =
blockIdx.x * static_cast<int64_t>(input_stride);
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
// Vectorized input/output/weight/residual to better utilize memory bandwidth.
vec4_t<scalar_t> const* vec_input =
reinterpret_cast<vec4_t<scalar_t> const*>(&input[token_offset]);
reinterpret_cast<vec4_t<scalar_t> const*>(&input[input_token_offset]);
vec4_t<scalar_t> const* vec_weight =
reinterpret_cast<vec4_t<scalar_t> const*>(weight);
q8x4_t<scalar_out_t>* vec_output =

View File

@@ -575,7 +575,7 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
// The range of logits within the row.
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
int rowEnd = max(0, seq_len - next_n + (rowIdx % next_n) + 1);
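// Clamp at 0: rows whose seq_len < next_n (e.g. padded rows) would
// otherwise produce a negative range.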
// Local pointers to this block
if constexpr (!multipleBlocksPerRow && !mergeBlocks) {

View File

@@ -564,10 +564,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute NVFP4 block quantized tensor.
ops.def(
"scaled_fp4_quant(Tensor! output, Tensor input,"
" Tensor! output_scale, Tensor input_scale, bool "
"is_sf_swizzled_layout) -> ()");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
"scaled_fp4_quant(Tensor input,"
" Tensor input_scale, bool "
"is_sf_swizzled_layout) -> (Tensor, Tensor)");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant_func);
// Out variant
// TODO: Add {at::Tag::out_variant} tag and update all call sites
// to use the functional variant once vLLM upgrades PyTorch.
// See pytorch/pytorch#176117.
ops.def(
"scaled_fp4_quant.out(Tensor input,"
" Tensor input_scale, bool "
"is_sf_swizzled_layout, *, Tensor(a!) output, Tensor(b!) output_scale) "
"-> ()");
ops.impl("scaled_fp4_quant.out", torch::kCUDA, &scaled_fp4_quant_out);
// Compute NVFP4 experts quantization.
ops.def(

View File

@@ -586,7 +586,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This is ~1.1GB and only changes when FlashInfer version bumps
# https://docs.flashinfer.ai/installation.html
# From versions.json: .flashinfer.version
ARG FLASHINFER_VERSION=0.6.4
ARG FLASHINFER_VERSION=0.6.6
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
@@ -620,7 +620,7 @@ RUN set -eux; \
ARG BITSANDBYTES_VERSION_X86=0.46.1
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
ARG TIMM_VERSION=">=1.0.17"
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.7"
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
@@ -628,7 +628,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
# ============================================================
# VLLM INSTALLATION (depends on build stage)

View File

@@ -9,17 +9,13 @@
#
# Build targets:
# vllm-openai (default): used for serving deployment
# vllm-openai-zen: vLLM from source + zentorch from PyPI via vllm[zen]
# vllm-test: used for CI tests
# vllm-dev: used for development
#
# Build arguments:
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
# VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
# VLLM_CPU_X86=false (default)|true (for cross-compilation)
# VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation)
#
@@ -36,7 +32,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -91,24 +87,9 @@ ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
ARG VLLM_CPU_AVX2=0
ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
ARG VLLM_CPU_AVX512=0
ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
ARG VLLM_CPU_AVX512BF16=0
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=1
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
ARG VLLM_CPU_X86=0
ENV VLLM_CPU_X86=${VLLM_CPU_X86}
# Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ...
ARG VLLM_CPU_ARM_BF16=0
ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
@@ -116,7 +97,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
WORKDIR /vllm-workspace
# Validate build arguments - prevent mixing incompatible ISA flags
RUN if [ "$TARGETARCH" = "arm64" ] && { [ "$VLLM_CPU_AVX2" != "0" ] || [ "$VLLM_CPU_AVX512" != "0" ] || [ "$VLLM_CPU_AVX512BF16" != "0" ] || [ "$VLLM_CPU_AVX512VNNI" != "0" ]; }; then \
RUN if [ "$TARGETARCH" = "arm64" ] && [ "$VLLM_CPU_X86" != "0" ]; then \
echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \
exit 1; \
fi && \
@@ -174,7 +155,7 @@ WORKDIR /vllm-workspace
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
apt-get install -y --no-install-recommends vim numactl clangd-14
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
@@ -232,23 +213,29 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
# Build configuration labels
ARG TARGETARCH
ARG VLLM_CPU_DISABLE_AVX512
ARG VLLM_CPU_AVX2
ARG VLLM_CPU_AVX512
ARG VLLM_CPU_AVX512BF16
ARG VLLM_CPU_AVX512VNNI
ARG VLLM_CPU_AMXBF16
ARG VLLM_CPU_X86
ARG VLLM_CPU_ARM_BF16
ARG PYTHON_VERSION
LABEL ai.vllm.build.target-arch="${TARGETARCH}"
LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
LABEL ai.vllm.build.cpu-x86="${VLLM_CPU_X86:-false}"
LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}"
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
ENTRYPOINT ["vllm", "serve"]
######################### ZEN CPU PYPI IMAGE #########################
FROM vllm-openai AS vllm-openai-zen
ARG TARGETARCH
RUN if [ "$TARGETARCH" != "amd64" ]; then \
echo "ERROR: vllm-openai-amd only supports --platform=linux/amd64"; \
exit 1; \
fi
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install "vllm[zen]"
ENTRYPOINT ["vllm", "serve"]

View File

@@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins
# release version: v0.6.4
# release version: v0.6.6
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo "git clone flashinfer..." \
&& git clone --depth 1 --branch v0.6.4 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& git clone --depth 1 --branch v0.6.6 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \

View File

@@ -184,6 +184,34 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# DeepEP build stage
FROM base AS build_deep
ARG ROCSHMEM_BRANCH="ba0bf0f3"
ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
ARG DEEPEP_BRANCH="e84464ec"
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
ARG DEEPEP_NIC="cx7"
ENV ROCSHMEM_DIR=/opt/rocshmem
RUN git clone ${ROCSHMEM_REPO} \
&& cd rocm-systems \
&& git checkout ${ROCSHMEM_BRANCH} \
&& mkdir -p projects/rocshmem/build \
&& cd projects/rocshmem/build \
&& cmake .. \
-DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \
-DROCM_PATH=/opt/rocm \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DUSE_EXTERNAL_MPI=OFF \
&& make -j \
&& make install
# Build DeepEP wheel.
# DeepEP looks for rocshmem at ROCSHMEM_DIR.
RUN git clone ${DEEPEP_REPO} \
&& cd DeepEP \
&& git checkout ${DEEPEP_BRANCH} \
&& python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
@@ -305,6 +333,11 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
# Install DeepEP wheel
RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
uv pip install --system /deep_install/*.whl
COPY --from=build_deep /opt/rocshmem /opt/rocshmem
# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
RUN apt-get update -q -y && apt-get install -q -y \
librdmacm1 \

View File

@@ -65,7 +65,7 @@
"default": "true"
},
"FLASHINFER_VERSION": {
"default": "0.6.4"
"default": "0.6.6"
},
"GDRCOPY_CUDA_VERSION": {
"default": "12.8"
@@ -83,7 +83,7 @@
"default": ">=1.0.17"
},
"RUNAI_MODEL_STREAMER_VERSION": {
"default": ">=0.15.3"
"default": ">=0.15.7"
}
}
}

View File

@@ -39,6 +39,12 @@ When run, benchmark script generates results under **benchmark/results** folder,
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
- `PROMPTS_PER_CONCURRENCY`: Multiplier to compute `num_prompts` for serving tests (`num_prompts = max_concurrency × value`). Overrides JSON `num_prompts`. Default value is NULL.
- `ENABLE_ADAPTIVE_CONCURRENCY`: Set the value to `1` to enable adaptive SLA-based concurrency search after the static serving max_concurrency sweep. Default value is 0.
- `SLA_TTFT_MS`: Default TTFT SLA threshold in milliseconds for adaptive concurrency search. Default value is 3000.
- `SLA_TPOT_MS`: Default TPOT SLA threshold in milliseconds for adaptive concurrency search. Default value is 100.
- `ADAPTIVE_MAX_PROBES`: Maximum number of extra adaptive search probes. Default value is 8.
- `ADAPTIVE_MAX_CONCURRENCY`: Maximum allowed concurrency during adaptive search. Default value is 1024.
### Visualization

View File

@@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
```
!!! warning
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][])
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.

View File

@@ -75,7 +75,7 @@ For an optimized workflow when iterating on C++/CUDA kernels, see the [Increment
vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
```bash
uv pip install pre-commit
uv pip install "pre-commit>=4.5.1"
pre-commit install
```
@@ -187,6 +187,30 @@ Using `-s` with `git commit` will automatically add this header.
- **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings)
and enable the `Git: Always Sign Off` (`git.alwaysSignOff`) field.
### AI Assisted Contributions
Before making an AI assisted contribution, you must:
1. **Be involved**: Do not submit "pure agent" PRs. The human submitter is responsible for reviewing all changed lines, validating behavior end-to-end, and running relevant tests.
2. **Ensure significance**: Avoid one-off "busywork" PRs (single typo, isolated style cleanup, one mutable default fix, etc.). Bundle mechanical cleanups into a clear, systematic scope.
When AI tools provide non-trivial assistance in generating or modifying code, you must:
1. **Review thoroughly**: You remain responsible for all code you submit. Review and understand AI-generated code with the same care as code you write manually.
2. **Disclose in PR**: Always mention when a pull request includes AI-generated code. Add a note in the PR description.
3. **Mark commits**: Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```
AI-assisted code must meet all quality standards: proper testing, documentation, adherence to style guides, and thorough review. Attribution helps reviewers evaluate contributions in context and maintains legal clarity for the project.
### PR Title and Classification
Only specific types of PRs will be reviewed. The PR title is prefixed

View File

@@ -164,18 +164,18 @@ Priority is **1 = highest** (tried first).
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder, Enc-Dec | N/A |
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder, Enc-Dec | N/A |
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ✅ | ❌ | All | N/A |
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ✅ | ✅ | ❌ | All | N/A |
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ✅ | ✅ | ❌ | All | N/A |
| `TREE_ATTN` | | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
> **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`.
>
@@ -204,13 +204,14 @@ configuration.
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
| ------- | ------ | --------- | ----------- | ---------- | ---- | ------ | --------- | --- | --------------- | ------------ |
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHMLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 1 | Any | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | 1 | Any | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
| `TRITON_MLA` | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
| `XPU_MLA_SPARSE` | fp16, bf16 | `auto`, `float16`, `bfloat16` | Any | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | Any |

View File

@@ -167,9 +167,6 @@ FusedMoEExpertsModular performs the core of the FusedMoE operations. The various
`FusedMoEExpertsModular::activation_formats()`: Return the supported Input and Output activation formats. i.e. Contiguous / Batched format.
`FusedMoEExpertsModular::supports_chunking()`: Return True if the implementation supports chunking. Typically,
implementations whose input format is `FusedMoEActivationFormat.Standard` support chunking, while `FusedMoEActivationFormat.BatchedExperts` implementations do not.
`FusedMoEExpertsModular::supports_expert_map()`: Return True if the implementation supports expert map.
`FusedMoEExpertsModular::workspace_shapes()` /
@@ -220,8 +217,8 @@ If you are adding some `FusedMoEPrepareAndFinalizeModular` / `FusedMoEExpertsMod
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`
methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
Doing this will add the new implementation to the test suite.

View File

@@ -35,7 +35,8 @@ th {
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferA2APrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize.FlashInferA2APrepareAndFinalize] |
| flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize.FlashInferNVLinkTwoSidedPrepareAndFinalize] |
| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize.FlashInferNVLinkOneSidedPrepareAndFinalize] |
!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8

View File

@@ -34,9 +34,6 @@ relies on caching artifacts to reduce start time, we must properly propagate the
with the LLM text-backbone, or other instances of the same artifact (as is the case with vision block). `is_encoder=True` is also needed for encoder
components (see Compile Range Integration).
3. The `set_forward_context` context manager should be used around the nn.Module's forward call. This properly forwards the vllm_config, which is needed
for torch.compile integration.
### CompilationConfig
With the exception of `compile_mm_encoder: true`, the multimodal encoder will inherit from the same compilation config as the text LLM. We may extend

View File

@@ -44,6 +44,12 @@ For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
--kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "cpu_bytes_to_use": 1000000000}}'
```
- **FlexKVConnectorV1**: refer to [examples/offline_inference/prefix_caching_flexkv.py](../../examples/offline_inference/prefix_caching_flexkv.py) for the example usage of FlexKVConnectorV1. FlexKV is a distributed KV Store and multi-level cache management system for ultra-large-scale LLM inference.
```bash
--kv-transfer-config '{"kv_connector":"FlexKVConnectorV1","kv_role":"kv_both"}'
```
## Benchmarks
Please refer to [benchmarks/disagg_benchmarks](../../benchmarks/disagg_benchmarks) for disaggregated prefilling benchmarks.

View File

@@ -219,7 +219,7 @@ Supported models:
* `ibm-granite/granite-4.0-h-small` and other Granite 4.0 models
Recommended flags: `--tool-call-parser hermes`
Recommended flags: `--tool-call-parser granite4`
* `ibm-granite/granite-3.0-8b-instruct`

View File

@@ -16,4 +16,6 @@ vLLM supports the following hardware platforms:
vLLM supports third-party hardware plugins that live **outside** the main `vllm` repository. These follow the [Hardware-Pluggable RFC](../../design/plugin_system.md).
A list of all supported hardware can be found on the [vllm.ai website](https://vllm.ai/#hardware). If you want to add new hardware, please contact us on [Slack](https://slack.vllm.ai/) or [Email](mailto:collaboration@vllm.ai).
A list of all supported hardware can be found on the vLLM website; see [Universal Compatibility - Hardware](https://vllm.ai/#compatibility).
If you want to add new hardware, please contact us on [Slack](https://slack.vllm.ai/) or [Email](mailto:collaboration@vllm.ai).

View File

@@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
--8<-- [start:requirements]
- OS: Linux
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)
- CPU flags: `avx512f` (Recommended), `avx2` (Limited features)
!!! tip
Use `lscpu` to check the CPU flags.
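For example, this one-liner lists whichever of those flags your CPU exposes:
```bash
lscpu | grep -oE 'avx512f|avx512_bf16|avx512_vnni|avx2' | sort -u
```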
@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
--8<-- [end:set-up-using-python]
--8<-- [start:pre-built-wheels]
Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels:
Pre-built vLLM wheels for x86 with AVX512/AVX2 are available since version 0.17.0. To install release wheels:
```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
@@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
```
Optionally, build a portable wheel which you can then install elsewhere:
```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel
VLLM_TARGET_DEVICE=cpu uv build --wheel --no-build-isolation
```
```bash
@@ -185,12 +185,9 @@ docker run \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN=<secret>" \
vllm/vllm-openai-cpu:latest-x86_64 <args...>
```
!!! warning
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
--8<-- [end:pre-built-images]
--8<-- [start:build-image-from-source]
@@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 <args...>
```bash
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_DISABLE_AVX512=<false (default)|true> \
--build-arg VLLM_CPU_AVX2=<false (default)|true> \
--build-arg VLLM_CPU_AVX512=<false (default)|true> \
--build-arg VLLM_CPU_AVX512BF16=<false (default)|true> \
--build-arg VLLM_CPU_AVX512VNNI=<false (default)|true> \
--build-arg VLLM_CPU_AMXBF16=<false|true (default)> \
--build-arg VLLM_CPU_X86=<false (default)|true> \
--tag vllm-cpu-env \
--target vllm-openai .
```
!!! note "Auto-detection by default"
By default, CPU instruction sets (AVX512, AVX2, etc.) are automatically detected from the build system's CPU flags. Build arguments like `VLLM_CPU_AVX2`, `VLLM_CPU_AVX512`, `VLLM_CPU_AVX512BF16`, `VLLM_CPU_AVX512VNNI`, and `VLLM_CPU_AMXBF16` are used for cross-compilation, as is `VLLM_CPU_X86`:
- `VLLM_CPU_{ISA}=true` - Force-enable the instruction set (build with ISA regardless of build system capabilities)
- `VLLM_CPU_{ISA}=false` - Rely on auto-detection (default)
##### Examples
###### Auto-detection build (default)
```bash
docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
```
###### Cross-compile for AVX512
```bash
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX512=true \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--tag vllm-cpu-avx512 \
--target vllm-openai .
```
###### Cross-compile for AVX2
```bash
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX2=true \
--tag vllm-cpu-avx2 \
--target vllm-openai .
```
#### Launching the OpenAI server
```bash

View File

@@ -7,7 +7,6 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
--8<-- [start:requirements]
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.3
- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package providing all necessary vLLM custom kernels for running vLLM on the Intel GPU platform
- Python: 3.12
!!! warning
@@ -26,8 +25,8 @@ Currently, there are no pre-built XPU wheels.
--8<-- [end:pre-built-wheels]
--8<-- [start:build-wheel-from-source]
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
- Second, install Python packages for vLLM XPU backend building:
- First, install the required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers).
- Second, install the Python packages for building the vLLM XPU backend (Intel OneAPI dependencies are installed automatically as part of `torch-xpu`; see [PyTorch XPU get started](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html)):
```bash
git clone https://github.com/vllm-project/vllm.git

View File

@@ -135,6 +135,19 @@ PRs requires at least one committer review and approval. If the code is covered
In cases where CI fails for reasons unrelated to the PR, the PR can be merged by the lead maintainers using the "force merge" option, which overrides the CI checks.
### AI Assisted Contributions
AI tools can accelerate development, but contributors remain fully responsible for all code they submit. Like the Developer Certificate of Origin, this policy centers on accountability: contributors must believe they have the right to submit their contribution under vLLM's open source license, regardless of how the code was created.
All AI-assisted contributions must meet the same quality, testing, and review standards as any other code. Contributors must review and understand AI-generated code before submission; in short, make sure it is good code:
- Do not submit "pure agent" PRs. The human submitter is responsible for reviewing all changed lines, validating behavior end-to-end, and running relevant tests.
- Attribution preserves legal clarity and community trust. Contributors must disclose AI assistance in pull requests and mark commits with appropriate trailers (e.g. `Co-authored-by:`).
- Avoid one-off "busywork" PRs (single typo, isolated style cleanup, one mutable default fix, etc.). Bundle mechanical cleanups into a clear, systematic scope.
!!! warning
These topics are outlined for agents in [AGENTS.md](../../AGENTS.md) with instructions for how to autonomously implement them.
### Slack
Contributors are encouraged to join `#pr-reviews` and `#contributors` channels.

View File

@@ -0,0 +1,31 @@
# Loading Model Weights with InstantTensor
InstantTensor accelerates loading Safetensors weights on CUDA devices through distributed loading, pipelined prefetching, and direct I/O. InstantTensor also supports GDS (GPUDirect Storage) when available.
For more details, see the [InstantTensor GitHub repository](https://github.com/scitix/InstantTensor).
## Installation
```bash
pip install instanttensor
```
## Use InstantTensor in vLLM
Add `--load-format instanttensor` as a command-line argument.
For example:
```bash
vllm serve Qwen/Qwen2.5-0.5B --load-format instanttensor
```
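For offline inference, the same load format should be selectable through the `LLM` constructor's `load_format` argument (a sketch assuming parity with the CLI flag):
```python
from vllm import LLM

# Assumes --load-format maps 1:1 to the load_format engine argument.
llm = LLM(model="Qwen/Qwen2.5-0.5B", load_format="instanttensor")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```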
## Benchmarks
| Model | GPU | Backend | Load Time (s) | Throughput (GB/s) | Speedup |
| --- | ---: | --- | ---: | ---: | --- |
| Qwen3-30B-A3B | 1*H200 | Safetensors | 57.4 | 1.1 | 1x |
| Qwen3-30B-A3B | 1*H200 | InstantTensor | 1.77 | 35 | <span style="color: green">**32.4x**</span> |
| DeepSeek-R1 | 8*H200 | Safetensors | 160 | 4.3 | 1x |
| DeepSeek-R1 | 8*H200 | InstantTensor | 15.3 | 45 | <span style="color: green">**10.5x**</span> |
For the full benchmark results, see <https://github.com/scitix/InstantTensor/blob/main/docs/benchmark.md>.

View File

@@ -31,6 +31,16 @@ vllm serve gs://core-llm/Llama-3-8b \
--load-format runai_streamer
```
To load a model from Azure Blob Storage, run:
```bash
AZURE_STORAGE_ACCOUNT_NAME=<account> \
vllm serve az://<container>/<model-path> \
--load-format runai_streamer
```
Authentication uses `DefaultAzureCredential`, which supports `az login`, managed identity, environment variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET`), and other methods.
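For example, authenticating with a service principal through the environment variables listed above:
```bash
AZURE_CLIENT_ID=<client-id> \
AZURE_TENANT_ID=<tenant-id> \
AZURE_CLIENT_SECRET=<client-secret> \
AZURE_STORAGE_ACCOUNT_NAME=<account> \
vllm serve az://<container>/<model-path> \
    --load-format runai_streamer
```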
To load a model from an S3-compatible object store, run:
```bash

View File

@@ -418,6 +418,7 @@ th {
| `Grok1ForCausalLM` | Grok2 | `xai-org/grok-2` | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
| `HyperCLOVAXForCausalLM` | HyperCLOVAX-SEED-Think-14B | `naver-hyperclovax/HyperCLOVAX-SEED-Think-14B` | ✅︎ | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
@@ -514,6 +515,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | |
| `BertSpladeSparseEmbeddingModel` | SPLADE | `naver/splade-v3` | | |
| `ErnieModel` | BERT-like Chinese ERNIE | `shibing624/text2vec-base-chinese-sentence` | | |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ |
@@ -556,8 +558,9 @@ These models primarily support the [`LLM.classify`](./pooling_models.md#llmclass
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | |
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
@@ -574,6 +577,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | ------------------------- | --------------------------- | --------------------------------------- |
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | |
| `ErnieForSequenceClassification` | BERT-like Chinese ERNIE | `Forrest20231206/ernie-3.0-base-zh-cls` | N/A | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. | [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | |
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ |
@@ -639,6 +643,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode)
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | --------------------------- | --------------------------------------- |
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | |
| `ErnieForTokenClassification` | BERT-like Chinese ERNIE | `gyr66/Ernie-3.0-base-chinese-finetuned-ner` | | |
| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | |
!!! note
@@ -713,8 +718,9 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ |
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `KimiAudioForConditionalGeneration` | Kimi-Audio | T + A<sup>+</sup> | `moonshotai/Kimi-Audio-7B-Instruct` | | ✅︎ |
| `KimiK25ForConditionalGeneration` | Kimi-K2.5 | T + I<sup>+</sup> | `moonshotai/Kimi-K2.5` | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc. | ✅︎ | ✅︎ |
| `Lfm2VlForConditionalGeneration` | LFM2-VL | T + I<sup>+</sup> | `LiquidAI/LFM2-VL-450M`, `LiquidAI/LFM2-VL-3B`, `LiquidAI/LFM2-VL-8B-A1B`, etc. | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
@@ -827,6 +833,7 @@ The following table lists those that are tested in vLLM.
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
| `ColPaliForRetrieval` | ColPali | T / I | `vidore/colpali-v1.3-hf` | | |
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |

View File

@@ -21,7 +21,8 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
| `flashinfer_nvlink_one_sided` | MNNVL systems | FlashInfer's one-sided A2A strategy for multi-node NVLink | High-throughput workloads |
| `flashinfer_nvlink_two_sided` | MNNVL systems | FlashInfer's two-sided A2A strategy for multi-node NVLink | Systems with NVLink across nodes |
| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production |
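For example, a decode-heavy expert-parallel deployment might select the backend like this (the model name and parallel sizes are placeholders):
```bash
vllm serve deepseek-ai/DeepSeek-V3 \
    --enable-expert-parallel \
    --data-parallel-size 8 \
    --all2all-backend deepep_low_latency
```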
## Single Node Deployment

View File

@@ -60,6 +60,9 @@ The environment variables:
!!! tip
You can add these environment variables to your shell profile (e.g., `.bashrc`, `.zshrc`), Claude Code configuration file (`~/.claude/settings.json`), or create a wrapper script for convenience.
!!! warning
Claude Code recently started injecting a per-request hash into the system prompt, which can defeat [prefix caching](../../design/prefix_caching.md): the prompt changes on every request, greatly reducing performance. This is handled automatically in vLLM versions > 0.17.1; for older versions, add `"CLAUDE_CODE_ATTRIBUTION_HEADER": "0"` to the `"env"` section of `~/.claude/settings.json`, as shown below (see this [blog post](https://unsloth.ai/docs/basics/claude-code#fixing-90-slower-inference-in-claude-code) from Unsloth).
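The relevant fragment of `~/.claude/settings.json` looks like this (only the `env` key is shown; keep the rest of your settings as-is):
```json
{
  "env": {
    "CLAUDE_CODE_ATTRIBUTION_HEADER": "0"
  }
}
```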
## Testing the Setup
Once Claude Code launches, try a simple prompt to verify the connection:

View File

@@ -72,6 +72,9 @@ In addition, we have the following custom APIs:
- Only applicable to [classification models](../models/pooling_models.md).
- [Score API](#score-api) (`/score`)
- Applicable to [embedding models and cross-encoder models](../models/pooling_models.md).
- [Cohere Embed API](#cohere-embed-api) (`/v2/embed`)
- Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed)
- Works with any [embedding model](../models/pooling_models.md), including multimodal models.
- [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
- Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
- Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
@@ -429,6 +432,137 @@ these extra parameters are supported instead:
--8<-- "vllm/entrypoints/pooling/base/protocol.py:embed-extra-params"
```
### Cohere Embed API
Our API is also compatible with [Cohere's Embed v2 API](https://docs.cohere.com/reference/embed), which adds support for modern embedding features such as truncation, output dimensions, embedding types, and input types. This endpoint works with any embedding model (including multimodal models).
#### Cohere Embed API request parameters
| Parameter | Type | Required | Description |
| --------- | ---- | -------- | ----------- |
| `model` | string | Yes | Model name |
| `input_type` | string | No | Prompt prefix key (model-dependent, see below) |
| `texts` | list[string] | No | Text inputs (use one of `texts`, `images`, or `inputs`) |
| `images` | list[string] | No | Base64 data URI images |
| `inputs` | list[object] | No | Mixed text and image content objects |
| `embedding_types` | list[string] | No | Output types (default: `["float"]`) |
| `output_dimension` | int | No | Truncate embeddings to this dimension (Matryoshka) |
| `truncate` | string | No | `END`, `START`, or `NONE` (default: `END`) |
#### Text embedding
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["Hello world", "How are you?"],
"embedding_types": ["float"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [
[0.012, -0.034, ...],
[0.056, 0.078, ...]
]
},
"texts": ["Hello world", "How are you?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 12}
}
}
```
#### Mixed text and image inputs
For multimodal models, you can embed images by passing base64 data URIs. The `inputs` field accepts a list of objects with mixed text and image content:
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "google/siglip-so400m-patch14-384",
"inputs": [
{
"content": [
{"type": "text", "text": "A photo of a cat"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}
]
}
],
"embedding_types": ["float"]
}'
```
#### Embedding types
The `embedding_types` parameter controls the output format. Multiple types can be requested in a single call:
| Type | Description |
| ---- | ----------- |
| `float` | Raw float32 embeddings (default) |
| `binary` | Bit-packed signed binary |
| `ubinary` | Bit-packed unsigned binary |
| `base64` | Little-endian float32 encoded as base64 |
```bash
curl -X POST "http://localhost:8000/v2/embed" \
-H "Content-Type: application/json" \
-d '{
"model": "Snowflake/snowflake-arctic-embed-m-v1.5",
"input_type": "query",
"texts": ["What is machine learning?"],
"embedding_types": ["float", "binary"]
}'
```
??? console "Response"
```json
{
"id": "embd-...",
"embeddings": {
"float": [[0.012, -0.034, ...]],
"binary": [[42, -117, ...]]
},
"texts": ["What is machine learning?"],
"meta": {
"api_version": {"version": "2"},
"billed_units": {"input_tokens": 8}
}
}
```
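A minimal decoding sketch for the non-float types, assuming `numpy` and the formats described in the table above (the function names are ours):
```python
import base64

import numpy as np

# "base64": little-endian float32 bytes, base64-encoded.
def decode_base64_embedding(b64: str) -> np.ndarray:
    return np.frombuffer(base64.b64decode(b64), dtype="<f4")

# "binary": bit-packed signed int8 values; reinterpret as uint8 and
# unpack back into individual 0/1 bits.
def unpack_binary_embedding(packed: list[int]) -> np.ndarray:
    return np.unpackbits(np.asarray(packed, dtype=np.int8).view(np.uint8))
```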
#### Truncation
The `truncate` parameter controls how inputs exceeding the model's maximum sequence length are handled:
| Value | Behavior |
| ----- | --------- |
| `END` (default) | Keep the first tokens, drop the end |
| `START` | Keep the last tokens, drop the beginning |
| `NONE` | Return an error if the input is too long |
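For example, to reject over-length inputs instead of silently truncating them:
```bash
curl -X POST "http://localhost:8000/v2/embed" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Snowflake/snowflake-arctic-embed-m-v1.5",
    "texts": ["<a very long document>"],
    "truncate": "NONE"
  }'
```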
#### Input type and prompt prefixes
The `input_type` field selects a prompt prefix to prepend to each text input. The available values
depend on the model:
- **Models with `task_instructions` in `config.json`**: The keys from the `task_instructions` dict are
the valid `input_type` values and the corresponding value is prepended to each text.
- **Models with `config_sentence_transformers.json` prompts**: The keys from the `prompts` dict are
the valid `input_type` values. For example, `Snowflake/snowflake-arctic-embed-xs` defines `"query"`,
so setting `input_type: "query"` prepends `"Represent this sentence for searching relevant passages: "`.
- **Other models**: `input_type` is not accepted and will raise a validation error if passed.
### Transcriptions API
Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);

View File

@@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python
import torch
import torch.distributed as dist
dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
local_rank = dist.get_rank() % torch.accelerator.device_count()
torch.accelerator.set_device_index(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.accelerator.synchronize()
@@ -337,7 +337,7 @@ import vllm
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
print(f"CUDA device count: {torch.accelerator.device_count()}")
EOF
```

View File

@@ -201,6 +201,34 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
)
# Kimi-Audio-7B-Instruct
def run_kimi_audio(question: str, audio_count: int) -> ModelRequestData:
"""Kimi-Audio-7B-Instruct for audio transcription and understanding."""
model_name = "moonshotai/Kimi-Audio-7B-Instruct"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
)
# Kimi-Audio uses <|im_kimia_text_blank|> as placeholder for audio features
audio_placeholder = "<|im_kimia_text_blank|>" * audio_count
# Default prompt for transcription
if not question:
question = "Please transcribe the audio"
prompt = f"{audio_placeholder}{question}"
# Stop at EOS token (151644) to prevent repetition
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=[151644],
)
# MiDashengLM
def run_midashenglm(question: str, audio_count: int):
model_name = "mispeech/midashenglm-7b"
@@ -485,6 +513,7 @@ model_example_map = {
"glmasr": run_glmasr,
"funaudiochat": run_funaudiochat,
"granite_speech": run_granite_speech,
"kimi_audio": run_kimi_audio,
"midashenglm": run_midashenglm,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,

View File

@@ -62,9 +62,9 @@ def run_simple_demo(args: argparse.Namespace):
llm = LLM(
model=model_name,
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
config_format="mistral" if args.format == "mistral" else "auto",
load_format="mistral" if args.format == "mistral" else "auto",
tokenizer_mode="mistral" if args.format == "mistral" else "hf",
config_format="mistral" if args.format == "mistral" else "hf",
load_format="mistral" if args.format == "mistral" else "hf",
limit_mm_per_prompt={"image": 1},
max_model_len=4096,
max_num_seqs=2,
@@ -102,9 +102,9 @@ def run_advanced_demo(args: argparse.Namespace):
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
llm = LLM(
model=model_name,
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
config_format="mistral" if args.format == "mistral" else "auto",
load_format="mistral" if args.format == "mistral" else "auto",
tokenizer_mode="mistral" if args.format == "mistral" else "hf",
config_format="mistral" if args.format == "mistral" else "hf",
load_format="mistral" if args.format == "mistral" else "hf",
limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img,
tensor_parallel_size=2,

View File

@@ -0,0 +1,221 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use FlexKV with vLLM for prefix caching.
FlexKV is a distributed KV Store and multi-level cache management system for
ultra-large-scale LLM inference.
Requirements:
- Install FlexKV (https://github.com/taco-project/FlexKV):
1. git clone git@github.com:taco-project/FlexKV.git
2. cd FlexKV && bash build.sh
- Ensure FlexKV is compatible with your vLLM version.
Usage:
1. Run this script:
python examples/offline_inference/prefix_caching_flexkv.py \
--model /path/to/your/model
2. Arguments:
--model Path or name of the model (required)
--tp-size Tensor parallel size (default: 1)
--gpu-memory-util GPU memory utilization (default: 0.4)
3. The script will:
- Create a FlexKV configuration file.
- Set the FLEXKV_CONFIG_PATH environment variable.
- Run vLLM with FlexKVConnectorV1 enabled.
- Compare results between regular execution, vLLM's default prefix
caching, and FlexKV.
"""
import argparse
import json
import os
import time
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py
def parse_args():
parser = argparse.ArgumentParser(
description="Example of using FlexKV with vLLM for prefix caching."
)
parser.add_argument(
"--model",
type=str,
required=True,
help="Path or name of the model to use.",
)
parser.add_argument(
"--tp-size",
type=int,
default=1,
help="Tensor parallel size (default: 1).",
)
parser.add_argument(
"--gpu-memory-util",
type=float,
default=0.4,
help="GPU memory utilization fraction (default: 0.4).",
)
return parser.parse_args()
def main():
args = parse_args()
flexkv_config = {
"server_recv_port": f"ipc:///tmp/flexkv_test_{os.getpid()}",
"cache_config": {
"enable_cpu": True,
"num_cpu_blocks": 10240,
},
"num_log_interval_requests": 200,
}
flexkv_config_path = f"./flexkv_config_{os.getpid()}.json"
with open(flexkv_config_path, "w") as f:
json.dump(flexkv_config, f)
os.environ["FLEXKV_CONFIG_PATH"] = flexkv_config_path
try:
_run(args)
finally:
if os.path.exists(flexkv_config_path):
os.remove(flexkv_config_path)
def _run(args):
# Common prefix.
prefix = (
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
"community, joyful discovery, and life-long learning. The candidate is "
"coming in for a first-round panel interview for a 8th grade Math "
"teaching role. They have 5 years of previous teaching experience "
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: "
)
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)
kv_transfer_config = {
"kv_connector": "FlexKVConnectorV1",
"kv_role": "kv_both",
}
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(
model=args.model,
enable_prefix_caching=False,
gpu_memory_utilization=args.gpu_memory_util,
tensor_parallel_size=args.tp_size,
)
print("Results without `enable_prefix_caching`")
# ruff: noqa: E501
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)
regular_generated_texts = []
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(
model=args.model,
enable_prefix_caching=True,
gpu_memory_utilization=args.gpu_memory_util,
tensor_parallel_size=args.tp_size,
kv_transfer_config=kv_transfer_config,
)
# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
# Wait for the KV offload task to finish.
time.sleep(2)
# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
print("Results with `enable_prefix_caching`")
cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Compare the results and display the speedup
generated_same = all(
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
)
print(f"Generated answers are the same: {generated_same}")
# Wait for the KV offload task to finish.
time.sleep(2)
# Reset the prefix cache so the next run is served from FlexKV.
prefix_cached_llm.reset_prefix_cache()
# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
print("Results with `flexkv`")
flexkv_generated_texts = []
# Print the outputs. You should see the same outputs as before.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
flexkv_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Compare the results and display the speedup
generated_same = all(
regular_generated_texts[i] == flexkv_generated_texts[i]
for i in range(len(prompts))
)
print(f"Generated answers are the same: {generated_same}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,384 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
End-to-end example for routed experts capture with hybrid models.
Validates that:
1. routed_experts is returned in CompletionOutput for MoE models.
2. Expert IDs are within valid range.
3. Results are deterministic across runs (baseline vs reference).
Usage:
python examples/offline_inference/routed_experts_e2e.py \
--model Qwen/Qwen3-30B-A3B \
--tp 4 \
--max-model-len 4096 \
--num-prompts 20 \
--max-new-tokens 50
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import uuid
from dataclasses import dataclass, field
import numpy as np
from vllm.engine.arg_utils import AsyncEngineArgs
logger = logging.getLogger(__name__)
DEFAULT_MODEL = "Qwen/Qwen3-30B-A3B"
TEST_PROMPTS = [
"Hello, my name is",
"The capital of France is",
"Explain quantum computing in simple terms:",
"Write a Python function that sorts a list:",
"The meaning of life is",
"In a distant galaxy, there was a",
"The best way to learn programming is",
"Once upon a time in a land far away,",
"The theory of relativity states that",
"How does photosynthesis work?",
"Describe the process of machine learning:",
"What are the benefits of exercise?",
"The history of artificial intelligence began",
"Translate the following to French: Hello world",
"Summarize the plot of Romeo and Juliet:",
"What is the difference between TCP and UDP?",
"The water cycle consists of",
"Explain how a neural network learns:",
"The periodic table organizes elements by",
"Write a haiku about the ocean:",
]
@dataclass
class InferenceResult:
"""Result from a single inference run."""
experts_list: list[np.ndarray] = field(default_factory=list)
token_ids_list: list[list[int]] = field(default_factory=list)
num_experts: int = 0
# ---------------------------------------------------------------------------
# Inference helpers
# ---------------------------------------------------------------------------
async def _run_async_inference(
engine_args: AsyncEngineArgs,
prompts: list[str],
max_new_tokens: int,
) -> InferenceResult:
"""Run inference using AsyncLLM."""
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncLLM
engine = AsyncLLM.from_engine_args(engine_args)
hf_config = engine.model_config.hf_text_config
num_experts: int = getattr(hf_config, "num_experts", 0) or getattr(
hf_config, "num_local_experts", 0
)
assert num_experts > 0, "Could not determine num_experts from model config"
sampling_params = SamplingParams(
temperature=0,
max_tokens=max_new_tokens,
)
async def _generate_one(prompt: str, idx: int):
request_id = str(uuid.uuid4())
final_output = None
async for output in engine.generate(prompt, sampling_params, request_id):
final_output = output
assert final_output is not None
completion = final_output.outputs[0]
routed = completion.routed_experts
num_prompt_tokens = len(final_output.prompt_token_ids)
num_generated_tokens = len(completion.token_ids)
expected_len = num_prompt_tokens + num_generated_tokens - 1
assert routed is not None, f"Prompt {idx}: routed_experts is None"
assert routed.shape[0] == expected_len, (
f"Prompt {idx}: routed_experts length {routed.shape[0]} != "
f"prompt ({num_prompt_tokens}) + generated ({num_generated_tokens})"
f" - 1 = {expected_len}"
)
return idx, routed, list(completion.token_ids)
tasks = [_generate_one(p, i) for i, p in enumerate(prompts)]
outputs = await asyncio.gather(*tasks)
# Sort by original index to maintain prompt order
outputs.sort(key=lambda x: x[0])
result = InferenceResult(num_experts=num_experts)
for _, routed, token_ids in outputs:
result.experts_list.append(routed)
result.token_ids_list.append(token_ids)
engine.shutdown()
return result
def run_inference(
model: str,
prompts: list[str],
max_new_tokens: int = 50,
tp: int = 1,
max_model_len: int = 4096,
) -> InferenceResult:
"""Run inference with routed experts capture enabled via AsyncLLM."""
engine_args = AsyncEngineArgs(
model=model,
enable_return_routed_experts=True,
tensor_parallel_size=tp,
max_model_len=max_model_len,
disable_log_stats=True,
attention_backend="FLASH_ATTN",
)
result = asyncio.run(_run_async_inference(engine_args, prompts, max_new_tokens))
from vllm.platforms import current_platform
if current_platform.is_cuda_alike():
current_platform.empty_cache()
return result
# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------
def validate_expert_ids(
experts_list: list[np.ndarray],
num_experts: int,
) -> None:
"""Check that all expert IDs are within valid range [0, num_experts)."""
for i, experts in enumerate(experts_list):
assert np.all(experts >= 0), (
f"Prompt {i}: negative expert IDs found, min={experts.min()}"
)
assert np.all(experts < num_experts), (
f"Prompt {i}: expert ID out of range [0, {num_experts}), "
f"max={experts.max()}"
)
def validate_shapes(experts_list: list[np.ndarray]) -> None:
"""Check that all routed_experts arrays have at least 2 dimensions."""
for i, experts in enumerate(experts_list):
assert experts.ndim >= 2, (
f"Prompt {i}: expected at least 2D array, got shape {experts.shape}"
)
logger.info("Prompt %d: routed_experts shape = %s", i, experts.shape)
# ---------------------------------------------------------------------------
# Comparison helpers
# ---------------------------------------------------------------------------
def compare_token_ids(
baseline: list[list[int]],
reference: list[list[int]],
) -> float:
"""Compare token IDs from two runs. Returns mismatch ratio."""
assert len(baseline) == len(reference), (
f"Length mismatch: {len(baseline)} vs {len(reference)}"
)
total_tokens = 0
total_mismatches = 0
for i, (base, ref) in enumerate(zip(baseline, reference)):
min_len = min(len(base), len(ref))
max_len = max(len(base), len(ref))
matches = 0
for a, b in zip(base[:min_len], ref[:min_len]):
if a != b:
break
matches += 1
total_mismatches += max_len - matches
total_tokens += max_len
if matches < min_len or len(base) != len(ref):
print(
f" Prompt {i}: token_ids len={len(base)} vs {len(ref)}, "
f"mismatches={max_len - matches}/{max_len}"
)
if total_tokens == 0:
raise ValueError("No tokens to compare")
mismatch_ratio = total_mismatches / total_tokens
print(
f"Token ID mismatches: {total_mismatches}/{total_tokens} ({mismatch_ratio:.4%})"
)
return mismatch_ratio
def compare_routed_experts(
baseline: list[np.ndarray],
reference: list[np.ndarray],
threshold: float = 0.05,
) -> float:
"""Compare two runs of routed experts. Returns mismatch ratio.
Raises AssertionError if ratio exceeds threshold.
"""
assert len(baseline) == len(reference), (
f"Length mismatch: {len(baseline)} vs {len(reference)}"
)
total_elements = 0
total_mismatches = 0
for i, (base, ref) in enumerate(zip(baseline, reference)):
min_len = min(len(base), len(ref))
max_len = max(len(base), len(ref))
if min_len == 0:
continue
base_trimmed = base[:min_len]
ref_trimmed = ref[:min_len]
matches = 0
for a, b in zip(base_trimmed, ref_trimmed):
if a.sum() != b.sum():
break
matches += 1
total_mismatches += max_len - matches
total_elements += max_len
if matches < min_len or len(base) != len(ref):
print(
f" Prompt {i}: routed_experts len={len(base)} vs {len(ref)}, "
f"mismatches={max_len - matches}/{max_len}"
)
if total_elements == 0:
raise ValueError("No elements to compare")
mismatch_ratio = total_mismatches / total_elements
print(
f"Routed experts mismatches: {total_mismatches}/{total_elements} "
f"({mismatch_ratio:.4%})"
)
assert mismatch_ratio < threshold, (
f"Too many mismatches: {total_mismatches}/{total_elements} "
f"({mismatch_ratio:.4%}) exceeds threshold {threshold:.4%}"
)
return mismatch_ratio
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main():
os.environ.setdefault("VLLM_BATCH_INVARIANT", "1")
parser = argparse.ArgumentParser(
description="Test routed experts capture for MoE models"
)
parser.add_argument("--model", type=str, default=DEFAULT_MODEL)
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--max-model-len", type=int, default=4096)
parser.add_argument("--num-prompts", type=int, default=20)
parser.add_argument("--max-new-tokens", type=int, default=50)
parser.add_argument(
"--deterministic",
action="store_true",
help="Run twice and compare results for determinism check",
)
parser.add_argument(
"--threshold",
type=float,
default=0.05,
help="Maximum allowed mismatch ratio for determinism check",
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO)
prompts = TEST_PROMPTS[: args.num_prompts]
print(f"Model: {args.model}")
print(f"TP: {args.tp}")
print(f"Prompts: {len(prompts)}")
print(f"Max new tokens: {args.max_new_tokens}")
print()
print("=== Run 1 (baseline) ===")
baseline = run_inference(
model=args.model,
prompts=prompts,
max_new_tokens=args.max_new_tokens,
tp=args.tp,
max_model_len=args.max_model_len,
)
print(f"num_experts (from model config): {baseline.num_experts}")
print("\n=== Validation ===")
validate_shapes(baseline.experts_list)
validate_expert_ids(baseline.experts_list, num_experts=baseline.num_experts)
print(f"All {len(baseline.experts_list)} results passed validation.")
for i, experts in enumerate(baseline.experts_list):
print(
f" Prompt {i}: shape={experts.shape}, "
f"min={experts.min()}, max={experts.max()}"
)
if args.deterministic:
print("\n=== Run 2 (reference) ===")
reference = run_inference(
model=args.model,
prompts=prompts,
max_new_tokens=args.max_new_tokens,
tp=args.tp,
max_model_len=args.max_model_len,
)
print("\n=== Determinism Check ===")
validate_expert_ids(reference.experts_list, num_experts=baseline.num_experts)
print("\n--- Token IDs ---")
token_mismatch = compare_token_ids(
baseline.token_ids_list, reference.token_ids_list
)
print("\n--- Routed Experts ---")
expert_mismatch = compare_routed_experts(
baseline.experts_list,
reference.experts_list,
threshold=args.threshold,
)
print(
f"\nDeterminism check passed. "
f"Token mismatch: {token_mismatch:.4%}, "
f"Expert mismatch: {expert_mismatch:.4%}"
)
print("\nAll tests passed!")
if __name__ == "__main__":
main()

View File

@@ -14,6 +14,10 @@ import regex as re
import zmq
from quart import Quart, make_response, request
from vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_common import (
MoRIIOConstants,
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
prefill_instances: list[dict] = []
@@ -213,6 +217,8 @@ async def handle_request():
dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"])
transfer_id = f"{MoRIIOConstants.TRANSFER_PREFIX}-{str(uuid.uuid4())}"
req_data_to_prefill = copy.deepcopy(req_data)
req_data_to_prefill["kv_transfer_params"] = {}
req_data["kv_transfer_params"] = {}
@@ -222,6 +228,7 @@ async def handle_request():
req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = (
decode_instance_endpoint["tp_size"]
)
req_data_to_prefill["kv_transfer_params"]["transfer_id"] = transfer_id
send_prefill_task = asyncio.create_task(
send_request_to_prefill(
@@ -267,6 +274,7 @@ async def handle_request():
if selected_prefill_dp_rank is not None:
req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank
req_data["kv_transfer_params"]["transfer_id"] = transfer_id
decode_request_task = asyncio.create_task(
start_decode_request(

View File

@@ -106,7 +106,7 @@ def main():
# IPC requires the training model to be on the same GPU as the vLLM server
# The server should be started on GPU 0 with reduced memory utilization
device = "cuda:0"
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Load the training model on the same GPU as the server
# Use bfloat16 to reduce memory footprint

View File

@@ -131,7 +131,7 @@ def main():
inference_world_size = get_world_size(BASE_URL)
world_size = inference_world_size + 1 # +1 for the trainer
device = f"cuda:{inference_world_size}"
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Load the training model
print(f"Loading training model: {MODEL_NAME}")

View File

@@ -167,6 +167,7 @@ fo = "fo"
nd = "nd"
eles = "eles"
datas = "datas"
ser = "ser"
ure = "ure"
[tool.uv]

View File

@@ -24,14 +24,14 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.29; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
xgrammar >= 0.1.32, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.9.1
mistral_common[image] >= 1.10.0
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

View File

@@ -7,13 +7,13 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
# Dependencies for CPUs
torch==2.10.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "s390x"
torchaudio; platform_machine != "s390x" and platform_machine != "riscv64"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "s390x"
torchvision; platform_machine != "s390x" and platform_machine != "riscv64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"

View File

@@ -9,7 +9,10 @@ torchaudio==2.10.0
# These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.4
flashinfer-python==0.6.6
# Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
# breaking changes in 1.19.0
nvidia-cudnn-frontend>=1.13.0,<1.19.0
# QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
nvidia-cutlass-dsl>=4.4.0.dev1

View File

@@ -1,2 +1,2 @@
# formatting
pre-commit==4.0.1
pre-commit>=4.5.1

View File

@@ -42,6 +42,7 @@ tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
fastsafetensors>=0.2.2
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13

View File

@@ -45,6 +45,8 @@ pystemmer==3.0.0
# via mteb
# Multi-modal processing
av==16.1.0
# required for audio_in_video tests
blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
@@ -95,7 +97,7 @@ transformers==4.57.5
# Pin HF Hub version
huggingface-hub==0.36.2
# Pin Mistral Common
mistral-common[image,audio]==1.9.1
mistral-common[image,audio]==1.10.0
# Required for Prithvi tests
terratorch==1.2.2
# Required for Prithvi tests

View File

@@ -15,7 +15,7 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
conch-triton-kernels==1.2.1
timm>=1.0.17
# amd-quark: required for Quark quantization on ROCm

Some files were not shown because too many files have changed in this diff Show More