Compare commits

..

18 Commits

Author SHA1 Message Date
Andreas Karatzas
89a77b1084 [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 4c078fa546)
(cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)
2026-02-24 20:30:22 -08:00
Kevin H. Luu
d3c1513f5f [ci] Use the right tag for CPU arm64 image (#34915)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit a1a2d79442)
(cherry picked from commit 772f70839192262ff01c533d821a11a225d1c00f)
2026-02-24 20:30:13 -08:00
Cyrus Leung
5dbfbc967b [CI/Build] Fix gRPC version mismatch (#35013)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 965fe45935)
(cherry picked from commit 90308959295b66049024649fe1273070477f343d)
2026-02-24 20:30:02 -08:00
khluu
c86cdcbcd2 Revert "[Release 2.10] Update to Torch 2.10 - final release (#30525)"
This reverts commit f97ca67176.
2026-02-24 20:28:53 -08:00
khluu
3c9496f146 Revert "[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)"
This reverts commit 55a1baebc5.
2026-02-24 20:28:45 -08:00
khluu
2d5be1dd5c release script
Signed-off-by: khluu <khluu000@gmail.com>
2026-02-12 02:37:52 -08:00
Michael Goin
7a06e5b05b [Bugfix] Fix MTP accuracy for GLM-5 (#34385)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit ec12d39d44)
2026-02-11 20:54:27 -08:00
Junseo Park
946b2f106c [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
Signed-off-by: pjs102793 <pjs102793@naver.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 5458eb835d)
2026-02-11 20:54:14 -08:00
Nick Hill
5e8adb0c49 [Misc] Bump fastsafetensors version for latest fixes (#34273)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 79504027ef)
2026-02-11 20:54:00 -08:00
Xinyu Dong
9be1ff2d3a [Bugfix] fix default is_neox_style is True for deepseek (#34353)
Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
(cherry picked from commit be7f3d5d20)
2026-02-11 20:53:40 -08:00
Jee Jee Li
b3ee90f961 [Model] GLM adaptation (#34124)
(cherry picked from commit 978a37c823)
2026-02-11 20:53:11 -08:00
Seiji Eicher
c44d0c6d66 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 5045d5c983)
2026-02-11 02:33:40 -08:00
Kunshang Ji
83db96d8cd [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit cb9574eb85)
2026-02-11 02:33:27 -08:00
zofia
dbfb79fe45 [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit b482f71e9f)
2026-02-11 02:33:15 -08:00
Roger Wang
b2e1fc3589 [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
(cherry picked from commit 8a5e0e2b2b)
2026-02-11 02:33:04 -08:00
Gregory Shtrasberg
55a1baebc5 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit c60f8e3b49)
2026-02-11 02:32:52 -08:00
Charlie Fu
e1e9841631 [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
(cherry picked from commit bb9f97308d)
2026-02-11 02:32:41 -08:00
zofia
5bd63387c3 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit 9bdb06b436)
2026-02-11 02:32:27 -08:00
1046 changed files with 19713 additions and 83757 deletions


@@ -1,7 +1,6 @@
group: Hardware - AMD Build
group: Hardware
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
@@ -10,7 +9,7 @@ steps:
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm


@@ -8,7 +8,7 @@ clean_docker_tag() {
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
@@ -142,16 +142,11 @@ resolve_parent_commit() {
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
# Write to a temp directory to avoid polluting the repo root (which is the
# Docker build context). Files left in the repo root get COPY'd into the
# image and can cause duplicate artifact uploads from downstream steps.
local bake_tmp
bake_tmp="$(mktemp -d)"
BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
@@ -159,7 +154,7 @@ print_bake_config() {
#################################
print_instance_info
if [[ $# -lt 5 ]]; then
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi
@@ -168,8 +163,10 @@ REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
# build config
TARGET="test-ci"
@@ -196,6 +193,8 @@ export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
@@ -203,6 +202,8 @@ echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"


@@ -3,9 +3,9 @@ steps:
- label: ":docker: Build image"
key: image-build
depends_on: []
timeout_in_minutes: 600
commands:
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
retry:
automatic:
- exit_status: -1 # Agent was lost
@@ -41,7 +41,7 @@ steps:
limit: 2
- exit_status: -10 # Agent was lost
limit: 2
- label: ":docker: Build CPU arm64 image"
key: cpu-arm64-image-build
depends_on: []


@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,13 +24,13 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--target vllm-test \
--progress plain .
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu


@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then


@@ -11,10 +11,10 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -25,10 +25,10 @@ fi
docker build \
--file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
--progress plain \
https://github.com/vllm-project/vllm-gaudi.git
# push
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu


@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit "$LIMIT"
--limit $LIMIT


@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``


@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``


@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install "lm-eval[api]>=0.4.11"
# pip install "lm-eval[api]>=0.4.9.2"
usage() {
echo``
@@ -20,11 +20,14 @@ usage() {
echo
}
while getopts "m:l:f:t:" OPT; do
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;


@@ -9,10 +9,8 @@ import json
import os
from dataclasses import dataclass
from importlib import util
from pathlib import Path
import pandas as pd
import regex as re
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@@ -277,131 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
"""
name = "sheet" if name is None else str(name)
name = re.sub(r"[:\\/?*\[\]]", "_", name)
name = name.strip().strip("'")
name = re.sub(r"\s+", " ", name)
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
model = d.get("Model", "model")
model_short = str(model).split("/")[-1]
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
startrow = 0
for title, df in blocks:
pd.DataFrame([[title]]).to_excel(
writer, sheet_name=sheet, index=False, header=False, startrow=startrow
)
startrow += 1
df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
startrow += len(df) + 3
def _safe_filename(s: str) -> str:
s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
return s[:180] if len(s) > 180 else s
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# -----------------------------
# Valid max concurrency summary helpers
# -----------------------------
@@ -555,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns:
if c == "Configuration":
continue
@@ -562,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {}
for c in summary_df.columns:
if c == "Configuration":
continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters)
@@ -584,95 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# -----------------------------
# Plot helper
# -----------------------------
@@ -750,21 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
# ---- NEW: export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser
@@ -885,6 +657,7 @@ def maybe_write_plot(
markers=True,
)
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f")
@@ -957,151 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items()
}
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
if csv_dir:
csv_dir.mkdir(parents=True, exist_ok=True)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx"
with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
# ---- Environment sheet (first) ----
env_sheet = _sanitize_sheet_name("Environment")
env_df = _load_env_df_for_inputs(args, files)
if env_df is None or env_df.empty:
pd.DataFrame(
[
{
"Section": "Environment",
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)",
}
]
).to_excel(xw, sheet_name=env_sheet, index=False)
else:
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
main_fh.write('<meta charset="utf-8">\n')
for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
main_fh.write(group_header)
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
main_fh.write(group_header)
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
dedup_i = 1
while sheet in xw.sheets:
dedup_i += 1
sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}")
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, display_group.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
def main():


@@ -1,4 +1,6 @@
#!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,19 +9,14 @@
set -x
set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
@@ -47,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo "$numa_count"
echo $numa_count
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
@@ -115,12 +112,13 @@ json2envs() {
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200"
timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do
until curl -X POST localhost:8000/v1/completions; do
sleep 1
done
'
done' && return 0 || return 1
}
kill_processes_launched_by_current_bash() {
@@ -254,16 +252,37 @@ run_benchmark_tests() {
done
}
run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
run_latency_tests() {
run_benchmark_tests "latency" "$1"
}
merge_serving_tests_stream() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
# This helper does NOT modify JSON; it only filters the stream in dry-run mode.
local serving_test_file="$1"
# shellcheck disable=SC2016
local merged='
run_startup_tests() {
run_benchmark_tests "startup" "$1"
}
run_throughput_tests() {
run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then
# Plain format: test cases array
.[]
@@ -285,50 +304,7 @@ merge_serving_tests_stream() {
else
error("Unsupported serving test file format: must be array or object with .tests")
end
'
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -397,7 +373,7 @@ run_serving_tests() {
echo "Server command: $server_command"
# support remote vllm server
client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
@@ -408,9 +384,6 @@ run_serving_tests() {
echo ""
echo "vLLM failed to start within the timeout period."
fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then
@@ -429,12 +402,14 @@ run_serving_tests() {
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard
@@ -450,9 +425,7 @@ run_serving_tests() {
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
@@ -470,15 +443,12 @@ run_serving_tests() {
done
# clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then
kill -9 "$server_pid"
kill_gpu_processes
fi
kill -9 $server_pid
kill_gpu_processes
done
}
main() {
local ARCH
ARCH=''
if [[ "$ON_CPU" == "1" ]]; then
@@ -488,13 +458,7 @@ main() {
check_gpus
ARCH="$arch_suffix"
fi
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
check_hf_token
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -515,16 +479,11 @@ main() {
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
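For context, a hedged sketch of how the DRY_RUN, MODEL_FILTER, and DTYPE_FILTER controls that appear on one side of this hunk would be exercised; the script name and the model/dtype values are illustrative assumptions, while the variable names and the behavior noted in the comments come from the hunk itself.

# Hypothetical dry-run invocation (script name and values assumed for illustration).
# On the side of the diff that defines these controls, DRY_RUN=1 skips the HF_TOKEN
# check, does not start the vLLM server or run client commands, and exits after the
# serving suite; MODEL_FILTER/DTYPE_FILTER narrow the merged serving-test stream.
DRY_RUN=1 \
MODEL_FILTER="meta-llama/Llama-3.1-8B-Instruct" \
DTYPE_FILTER="bfloat16" \
SERVING_JSON="serving-tests.json" \
bash run-performance-benchmarks.sh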


@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}


@@ -1,283 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}


@@ -148,6 +148,136 @@
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}


@@ -248,8 +248,8 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
- group: "Publish release artifacts"
key: "publish-release-artifacts"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
@@ -265,6 +265,27 @@ steps:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
- block: "Confirm update release images to DockerHub"
key: block-update-release-images-dockerhub
depends_on:
- input-release-version
- annotate-release-workflow
- label: "Publish release images to DockerHub"
depends_on:
- block-update-release-images-dockerhub
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)


@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases


@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1
fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo ""
echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"


@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT
trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \


@@ -1,37 +1,25 @@
#!/bin/bash
# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
###############################################################################
# Helper Functions
###############################################################################
# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -40,12 +28,15 @@ cleanup_docker() {
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
@@ -54,258 +45,193 @@ cleanup_docker() {
}
cleanup_network() {
local max_nodes=${NUM_NODES:-2}
for node in $(seq 0 $((max_nodes - 1))); do
if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}" || true
for node in $(seq 0 $((NUM_NODES-1))); do
if docker pr -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep -q docker-net; then
docker network rm docker-net || true
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
###############################################################################
# Pytest marker re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around pytest -m marker expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
# This function detects unquoted multi-word marker expressions and re-quotes
# them so they survive the final bash -c expansion.
###############################################################################
re_quote_pytest_markers() {
local cmds="$1"
# Pattern: -m not <identifier> -> -m 'not <identifier>'
# Handles the common cases: 'not cpu_test', 'not slow_test', etc.
cmds=$(echo "$cmds" | sed -E "s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g")
echo "$cmds"
}
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image ---
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands="$*"
commands=$@
echo "Raw commands: $commands"
# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
commands=$(apply_rocm_test_overrides "$commands")
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
MYPYTHONPATH=".."
# Verify GPU access
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
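# Illustrative only (hypothetical host values): `getent group render` prints a line such as
# "render:x:110:ciuser", so the cut above yields render_gid=110, which is passed to the
# container below via --group-add "$render_gid" so it can open /dev/kfd and the render devices.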
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
# BASH_REMATCH[1] = prefix (everything before first bracket)
# BASH_REMATCH[2] = comma-separated node0 commands
# BASH_REMATCH[3] = comma-separated node1 commands
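# Illustrative example (hypothetical test commands, not executed here):
#   commands='setup ; ["pytest -v -s a.py, pytest -v -s b.py"] && ["pytest -v -s c.py, pytest -v -s d.py"]'
#   => prefix="setup", node0 splits on ',' into two commands, node1 likewise,
#   and node0[i]/node1[i] are paired by index into one run-multi-node-test.sh step each.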
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
saved_IFS=$IFS
IFS=','
read -ra node0 <<< "${BASH_REMATCH[2]}"
read -ra node1 <<< "${BASH_REMATCH[3]}"
IFS=$saved_IFS
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
cleanup_network
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Multi-node job detected but failed to parse bracket command syntax."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
echo "Got: $commands"
cleanup_network
exit 111
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi


@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All CPU tests are expected to finish in less than 40 mins.


@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
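# Note: Docker's default /dev/shm is only 64 MB, which is typically too small for the
# shared-memory communication between tensor-parallel worker processes, hence 4g here.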
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"


@@ -7,7 +7,7 @@ set -exuo pipefail
# Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - .
cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
@@ -39,12 +39,12 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; }
remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
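# Illustrative: EXITCODE starts at 1 and is presumably reset on success further down (not
# shown in this hunk); the EXIT trap removes the container first and then exits with that
# code, so an aborted run still cleans up and reports failure.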
remove_docker_containers
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \
docker run --rm --runtime=habana --name=${container_name} --network=host \
-e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \


@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if ! get_config; then
if [ $? -ne 0 ]; then
exit 1
fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \
--progress=plain --load -t "${image_name}" -f - .
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ...
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device"
printf '%s\n' "/dev/davinci${dev_idx}"
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}
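# Illustrative example (agent name assumed, not taken from a real host):
#   parse_and_gen_devices "atlas-a2-001-2-2cards"
#   -> index=2, cards_num=2, dev_idx=(index-1)*cards_num+i = 2 and 3
#   -> device args: --device /dev/davinci2 --device /dev/davinci3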
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}"
mkdir -p ${model_cache_dir}
docker run \
"${device_args[@]}" \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \


@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"


@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"


@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {


@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME"
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT


@@ -0,0 +1,98 @@
#!/bin/bash
set -ex
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
echo "RELEASE_VERSION is not set"
exit 1
fi
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
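# Illustrative (standard multi-arch manifest behavior): after these pushes, pulling
# vllm/vllm-openai:v${RELEASE_VERSION} resolves to the -x86_64 image on amd64 hosts and to
# the -aarch64 image on arm64 hosts; the same applies to the -cu130 and -cpu manifests.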


@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
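# Illustrative: a pin such as "vllm==0.10.0" (hypothetical version) is deleted here so that
# `uv sync --inexact` below does not replace the nightly vLLM already installed in the
# test environment with the pinned release.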
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"


@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend "$BACK" \
--port "$PORT" &
--all2all-backend $BACK \
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
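# e.g. invoked from the scheduled pipeline as:
#   bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030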
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
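# With the defaults above this gates the run on GSM8K accuracy >= 0.25 over the full
# 1319-question set; tighten THRESHOLD via the first positional argument if needed.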
cleanup
SERVER_PID=


@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \
--enforce-eager \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend "$BACK" \
--all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \
--max-model-len 2048 \
--gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port "$PORT" &
--port $PORT &
SERVER_PID=$!
wait_for_server "$PORT"
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")


@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
# shellcheck source=/dev/null
source "$ENV_FILE"
source $ENV_FILE
remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file "$ENV_FILE" \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL="$MODEL" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \


@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve "$MODEL" \
vllm serve $MODEL \
--seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for _ in {1..120}; do
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len "$OUTPUT_LEN" \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."


@@ -76,15 +76,16 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# these indices have relative paths that work as long as they sit next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
alias_arg=""
fi
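# Illustrative: with a hypothetical DEFAULT_VARIANT_ALIAS=cu129, "${alias_args[@]}" expands
# to exactly two argv words (--alias-to-default cu129) and to zero words when the alias is
# unset, and it keeps the value intact even if it contains shell-special characters.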
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
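# Illustrative layout (hypothetical release 0.11.0 at commit abc123): indices end up under
# s3://$BUCKET/abc123/ and s3://$BUCKET/0.11.0/, and both point back at the wheels stored
# under the commit directory, as described above.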


@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
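# e.g. a hypothetical vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl would be uploaded, while
# variant wheels such as vllm-0.11.0+cu130-... are skipped because of the '+' in the name.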
@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"


@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -14,8 +14,3 @@ steps:
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd


@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1


@@ -121,10 +121,13 @@ steps:
optional: true
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# Run all models and attn backends but only Inductor partition and native custom ops
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
@@ -159,7 +162,7 @@ steps:
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -194,8 +197,7 @@ steps:
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops
# include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# Run all models and attn backends but only Inductor partition and native custom ops
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"


@@ -104,6 +104,7 @@ steps:
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10
@@ -145,7 +146,6 @@ steps:
num_devices: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -165,7 +165,6 @@ steps:
num_devices: 2
num_nodes: 2
no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
@@ -198,18 +197,7 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
- label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4


@@ -29,11 +29,15 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
timeout_in_minutes: 60
device: h100
- label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 30
optional: true
num_devices: 1
soft_fail: true
num_devices: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh


@@ -28,11 +28,3 @@ steps:
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine


@@ -24,11 +24,6 @@ steps:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
@@ -47,13 +42,15 @@ steps:
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling)
@@ -65,11 +62,6 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/pooling
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Responses API)
timeout_in_minutes: 50


@@ -115,7 +115,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -157,3 +156,14 @@ steps:
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py


@@ -73,29 +73,3 @@ steps:
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt


@@ -16,7 +16,6 @@ steps:
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
@@ -26,11 +25,6 @@ steps:
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU)
depends_on:
@@ -114,11 +108,9 @@ steps:
timeout_in_minutes: 50
source_file_dependencies:
- vllm/
- tests/detokenizer
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
@@ -131,7 +123,6 @@ steps:
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -145,7 +136,6 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
@@ -153,6 +143,20 @@ steps:
- pytest -v -s transformers_utils
- pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100)
timeout_in_minutes: 25
device: h100


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -36,12 +38,6 @@ steps:
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Basic Models Test (Other CPU) # 5min
depends_on:


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@@ -37,7 +40,7 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -52,21 +56,13 @@ steps:
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/


@@ -12,10 +12,3 @@ steps:
commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers

.github/CODEOWNERS

@@ -2,60 +2,42 @@
# for more info about CODEOWNERS file
# These lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
# Model runner V2
/vllm/v1/worker/gpu @WoosukKwon
@@ -133,8 +115,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -170,7 +152,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop
# Security guide and policies


@@ -19,7 +19,6 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies
run: |

.gitignore

@@ -238,6 +238,3 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h


@@ -143,11 +143,6 @@ repos:
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last
- id: suggestion
name: Suggestion


@@ -9,14 +9,13 @@ build:
python: "3.12"
jobs:
post_checkout:
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt


@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
#
# Try to find python package with an executable that exactly matches
@@ -293,7 +293,6 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -434,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
endif()
if (MARLIN_SM75_ARCHS)
if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -446,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
endif()
if (MARLIN_FP8_ARCHS)
if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -771,24 +770,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -1061,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
endif()
if (MARLIN_MOE_SM75_ARCHS)
if (MARLIN_MOE_SM75_ARCHS)
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SM75_SRC}"
@@ -1100,27 +1081,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures")
endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif()
message(STATUS "Enabling moe extension.")

View File

@@ -229,40 +229,3 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"

View File

@@ -43,7 +43,6 @@ from common import (
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
batch_spec_sort_key,
is_mla_backend,
)
@@ -219,13 +218,10 @@ def run_model_parameter_sweep(
by_param_and_spec[key].append(r)
break
# Sort by param value then spec (batch_size, q_len, kv_len)
# Sort by param value then spec
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
)
current_param_value = None
@@ -334,7 +330,7 @@ def run_parameter_sweep(
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
for spec in sorted(by_spec.keys()):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
@@ -500,18 +496,15 @@ def main():
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Override args with YAML values
# (YAML takes precedence unless CLI arg was explicitly set)
# Backend(s)
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
@@ -551,15 +544,13 @@ def main():
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys)
if "device" in yaml_config:
args.device = yaml_config["device"]
if "repeats" in yaml_config:
args.repeats = yaml_config["repeats"]
if "warmup_iters" in yaml_config:
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Benchmark settings
if "benchmark" in yaml_config:
bench = yaml_config["benchmark"]
args.device = bench.get("device", args.device)
args.repeats = bench.get("repeats", args.repeats)
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.profile_memory = bench.get("profile_memory", args.profile_memory)
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:

View File

@@ -12,36 +12,16 @@ from typing import Any
import numpy as np
import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console
from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
return (0, 0, 0)
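A quick way to see why this key exists: Python's default string sort puts "128q1s1k" ahead of "2q1ks4k" because it compares characters, not magnitudes. The snippet below uses a simplified leading-integer key in place of the full parse-based one, purely to illustrate the ordering difference.

import re

specs = ["16q1s512", "2q1ks4k", "128q1s1k", "8q8s4k"]

def leading_int(spec: str) -> int:
    m = re.match(r"(\d+)", spec)
    return int(m.group(1)) if m else 1  # a spec like "q1ks2k" implies one request

print(sorted(specs))                   # ['128q1s1k', '16q1s512', '2q1ks4k', '8q8s4k']
print(sorted(specs, key=leading_int))  # ['2q1ks4k', '8q8s4k', '16q1s512', '128q1s1k']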
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None):
def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -52,8 +32,6 @@ class MockHfConfig:
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self):
return self
@@ -104,38 +82,6 @@ class MockKVBProj:
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
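For intuition, the buffer MockIndexer fills is what lets a sparse MLA kernel restrict each token to a small set of KV slots. The stand-alone sketch below does the dense equivalent, a gather over the selected positions, with made-up shapes; real backends consume the index buffer inside fused kernels rather than materializing the gather.

import torch

num_tokens, topk, kv_len, head_dim = 4, 8, 128, 64
kv_slots = torch.randn(kv_len, head_dim)                                # flattened KV entries
topk_indices = torch.randint(0, kv_len, (num_tokens, topk), dtype=torch.int32)

# Gather only the selected slots per token: [num_tokens, topk, head_dim]
selected = kv_slots[topk_indices.long()]
print(selected.shape)  # torch.Size([4, 8, 64])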
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
@@ -370,19 +316,14 @@ class ResultsFormatter:
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec, preserving first-occurrence order
# Group by batch spec
by_spec = {}
specs_order = []
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
@@ -396,8 +337,6 @@ class ResultsFormatter:
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
@@ -411,14 +350,12 @@ class ResultsFormatter:
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in specs_order:
for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
row = [spec]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
@@ -549,11 +486,10 @@ def get_attention_scale(head_dim: int) -> float:
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the AttentionBackendEnum.
Check if backend is an MLA backend using the backend's is_mla() property.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
Returns:
True if the backend is an MLA backend, False otherwise
@@ -561,8 +497,7 @@ def is_mla_backend(backend: str) -> bool:
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
backend_class = AttentionBackendEnum[backend.upper()].get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError):
except (KeyError, ValueError, ImportError):
return False
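Usage sketch for the helper above, assuming a vLLM installation whose registry exposes the named backends; the import path matches the `from common import ... is_mla_backend` line shown earlier in this diff.

from common import is_mla_backend  # benchmark helper defined above

backends = ["CUTLASS_MLA", "FLASH_ATTN_MLA", "FLASH_ATTN", "TRITON_ATTN"]
mla_backends = [b for b in backends if is_mla_backend(b)]
standard_backends = [b for b in backends if not is_mla_backend(b)]
# With vLLM importable this is expected to print:
# ['CUTLASS_MLA', 'FLASH_ATTN_MLA'] ['FLASH_ATTN', 'TRITON_ATTN']
print(mla_backends, standard_backends)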

View File

@@ -3,7 +3,7 @@
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
@@ -12,13 +12,6 @@ model:
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
@@ -41,30 +34,28 @@ batch_specs:
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 100
warmup_iters: 10
repeats: 5
warmup_iters: 3
profile_memory: true
# Backend-specific tuning
CUTLASS_MLA:
cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA:
flashattn_mla:
reorder_batch_threshold: 512
FLASHMLA:
flashmla:
reorder_batch_threshold: 1

View File

@@ -45,10 +45,10 @@ batch_specs:
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -6,7 +6,7 @@
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: FLASH_ATTN_MLA
backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
@@ -62,10 +62,11 @@ model:
block_size: 128
# Benchmark settings
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
benchmark:
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:

View File

@@ -41,17 +41,18 @@ batch_specs:
# Backends that support query length > 1
backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - FLASHINFER_MLA
# - flashinfer_mla
# Benchmark settings
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:

View File

@@ -25,22 +25,14 @@ batch_specs:
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8)
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
# Context extension
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends:
- FLASH_ATTN
- TRITON_ATTN
- FLASHINFER
- flash
- triton
- flashinfer
device: "cuda:0"
repeats: 5

View File

@@ -8,13 +8,14 @@ This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import importlib
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockIndexer,
MockKVBProj,
MockLayer,
setup_mla_dims,
@@ -61,7 +62,6 @@ def create_minimal_vllm_config(
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
@@ -73,8 +73,6 @@ def create_minimal_vllm_config(
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns:
VllmConfig for benchmarking
@@ -84,7 +82,7 @@ def create_minimal_vllm_config(
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
@@ -122,12 +120,16 @@ def create_minimal_vllm_config(
seed=0,
max_model_len=32768,
quantization=None,
quantization_param_path=None,
enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto",
)
finally:
@@ -178,65 +180,56 @@ def create_minimal_vllm_config(
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
# Backend name to class name prefix mapping
_BACKEND_NAME_MAP = {
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = {
"FLASHMLA": {
"flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
},
"FLASHMLA_SPARSE": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"flashinfer_mla": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Get backend configuration using naming conventions.
Uses the registry to get the backend class and extract configuration
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
All MLA backends follow the pattern:
- Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata)
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
- MetadataBuilder: {Name}MetadataBuilder
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
if backend not in _BACKEND_NAME_MAP:
raise ValueError(f"Unknown backend: {backend}")
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
name = _BACKEND_NAME_MAP[backend]
props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return {
"backend_class": backend_class,
"impl_class": backend_class.get_impl_cls(),
"builder_class": backend_class.get_builder_cls(),
"module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": f"{name}Impl",
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"),
"block_size": block_size,
"is_sparse": is_sparse,
"block_size": props.get("block_size", None),
}
@@ -454,26 +447,22 @@ def _create_backend_impl(
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict from _get_backend_config()
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns:
Tuple of (impl, layer, builder_instance, indexer)
Tuple of (impl, layer, builder_instance)
"""
# Get classes from backend config (already resolved by _get_backend_config)
impl_class = backend_cfg["impl_class"]
builder_class = backend_cfg["builder_class"]
# Import backend classes
backend_module = importlib.import_module(backend_cfg["module"])
impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
@@ -485,44 +474,26 @@ def _create_backend_impl(
v_head_dim=mla_dims["v_head_dim"],
)
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
@@ -544,7 +515,9 @@ def _create_backend_impl(
# Create builder instance if needed
builder_instance = None
if builder_class:
if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -556,7 +529,7 @@ def _create_backend_impl(
device=device,
)
return impl, layer, builder_instance, indexer
return impl, layer, builder_instance
# ============================================================================
@@ -621,7 +594,6 @@ def _run_single_benchmark(
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
indexer=None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
@@ -634,7 +606,6 @@ def _run_single_benchmark(
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
indexer: Optional MockIndexer for sparse backends
Returns:
BenchmarkResult with timing statistics
@@ -642,9 +613,7 @@ def _run_single_benchmark(
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
@@ -672,16 +641,8 @@ def _run_single_benchmark(
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
@@ -732,13 +693,11 @@ def _run_single_benchmark(
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
@@ -748,7 +707,6 @@ def _run_mla_benchmark_batched(
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
List of BenchmarkResult objects
@@ -772,27 +730,19 @@ def _run_mla_benchmark_batched(
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
# Create backend impl, layer, and builder (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device
)
# Run each benchmark with the shared impl
@@ -818,7 +768,6 @@ def _run_mla_benchmark_batched(
backend_cfg,
mla_dims,
device,
indexer=indexer,
)
results.append(result)
@@ -844,24 +793,20 @@ def run_mla_benchmark(
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse)
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -871,9 +816,9 @@ def run_mla_benchmark(
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends
else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
@@ -885,7 +830,7 @@ def run_mla_benchmark(
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
results = _run_mla_benchmark_batched(backend, configs_with_params)
# Return single result or list based on input
return results[0] if return_single else results
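A hypothetical driver for the batched mode described in the docstring. The module name and the BenchmarkConfig constructor arguments are assumptions for illustration; only run_mla_benchmark's (config, param) tuple convention comes from the code above.

from common import BenchmarkConfig           # assumed location of BenchmarkConfig
from mla_backends import run_mla_benchmark   # assumed module name for this file

# Pair each config with a reorder_batch_threshold (the FlashAttn/FlashMLA param).
configs = [
    (BenchmarkConfig(backend="flashmla", batch_spec=spec, block_size=64), 1)
    for spec in ("64q1s2k", "64q1s8k")
]
results = run_mla_benchmark("flashmla", configs)
for r in results:
    print(r.config.batch_spec, f"{r.mean_time * 1e3:.3f} ms")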

View File

@@ -8,9 +8,7 @@ This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import logging
import types
from contextlib import contextmanager
import numpy as np
import torch
@@ -26,13 +24,8 @@ from vllm.config import (
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
@@ -40,41 +33,37 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
if backend not in _BACKEND_CONFIG:
raise ValueError(
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
) from e
return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
"""Temporarily set vLLM logger to WARNING level."""
logger = logging.getLogger("vllm")
old_level = logger.level
logger.setLevel(logging.WARNING)
try:
yield
finally:
logger.setLevel(old_level)
f"Unknown backend: {backend}. "
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
)
return _BACKEND_CONFIG[backend]
# ============================================================================
@@ -99,7 +88,11 @@ def _build_common_attn_metadata(
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
max_seq_len = int(seq_lens.max().item())
seq_lens_cpu = seq_lens.cpu()
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
@@ -114,6 +107,8 @@ def _build_common_attn_metadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
@@ -126,6 +121,7 @@ def _build_common_attn_metadata(
def _create_vllm_config(
config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
@@ -133,7 +129,7 @@ def _create_vllm_config(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype="auto", # Use model's native dtype
dtype=dtype,
seed=0,
max_model_len=1024,
)
@@ -202,12 +198,15 @@ def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
dtype: torch.dtype,
):
"""Create backend implementation instance."""
backend_class = backend_cfg["backend_class"]
import importlib
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
@@ -228,7 +227,7 @@ def _create_backend_impl(
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer
return backend_class, impl, layer, dtype
def _create_metadata_builder(
@@ -236,44 +235,11 @@ def _create_metadata_builder(
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
backend_name: str = "",
):
"""Create metadata builder instance."""
layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
return backend_class.get_builder_cls()(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
layer_names=["layer_0"],
vllm_config=vllm_config,
device=device,
)
@@ -315,44 +281,39 @@ def _create_input_tensors(
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
backend_class,
cache_layout: str,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers using the backend's methods.
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
to create the cache with the correct shape and memory layout.
"""
# Get the logical shape from the backend
cache_shape = backend_class.get_kv_cache_shape(
num_blocks=max_num_blocks,
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
)
# Get the stride order for custom memory layout
try:
stride_order = backend_class.get_kv_cache_stride_order()
assert len(stride_order) == len(cache_shape)
except (AttributeError, NotImplementedError):
stride_order = tuple(range(len(cache_shape)))
# Permute shape to physical layout order
physical_shape = tuple(cache_shape[i] for i in stride_order)
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
"""Create KV cache tensors for all layers."""
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
max_num_blocks,
2,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
else:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
2,
max_num_blocks,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
return cache_list
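The variant that queries the backend for get_kv_cache_shape() and get_kv_cache_stride_order() relies on a small allocation trick: the tensor is allocated in its physical (memory) order and then permuted so callers see the logical shape while the strides match the kernel's expected layout. A stand-alone illustration, with made-up shapes rather than any backend's real layout:

import torch

logical_shape = (2, 16, 128, 1, 64)   # e.g. [2, num_blocks, block_size, num_kv_heads, head_dim]
stride_order = (1, 0, 2, 3, 4)        # physical layout stores num_blocks outermost

physical_shape = tuple(logical_shape[i] for i in stride_order)
inv_order = [stride_order.index(i) for i in range(len(stride_order))]

cache = torch.zeros(*physical_shape).permute(*inv_order)
print(cache.shape)            # logical view: torch.Size([2, 16, 128, 1, 64])
print(cache.is_contiguous())  # False: strides follow the physical layout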
@@ -435,7 +396,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Supports: flash, triton, flashinfer
Args:
config: Benchmark configuration
@@ -450,79 +411,60 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
requests = parse_batch_spec(config.batch_spec)
if config.backend == "FLASHINFER":
if config.backend == "flashinfer":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
batch_size = len(q_lens)
# Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
# Suppress vLLM logs during setup to reduce spam
with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
backend_class, impl, layer, dtype = _create_backend_impl(
backend_cfg, config, device
)
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device, config.backend
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_class, device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0

View File

@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm"
@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes...
server_started=0
for _ in {1..60}; do
for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f "$vllm_log"
rm -f $vllm_log
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"
@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate "$best_request_rate" \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \

View File

@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")

View File

@@ -1,471 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
Compares:
- apply_top_k_top_p_triton (Triton binary search)
- apply_top_k_top_p (PyTorch sort-based)
Scenarios:
- top_k only (whole batch, partial batch)
- top_p only (whole batch, partial batch)
- mix of top_k and top_p
"""
import argparse
import gc
from dataclasses import dataclass
import torch
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
from vllm.v1.sample.ops.topk_topp_triton import (
apply_top_k_top_p_triton,
reset_buffer_cache,
)
@dataclass
class BenchmarkConfig:
"""Configuration for a benchmark run."""
name: str
batch_size: int
vocab_size: int
# k and p can be tensors or None
k_values: torch.Tensor | None # [batch_size] or None
p_values: torch.Tensor | None # [batch_size] or None
description: str
ops_pct: float = 0.0 # Percentage of ops relative to batch size
def calculate_ops_pct(
k_values: torch.Tensor | None,
p_values: torch.Tensor | None,
vocab_size: int,
batch_size: int,
) -> float:
"""
Calculate the percentage of active top-k and top-p operations.
Returns percentage where 100% = batch_size ops.
E.g., if all rows have both top-k and top-p active, returns 200%.
"""
active_ops = 0
if k_values is not None:
# Count rows where k < vocab_size (active top-k filtering)
active_ops += (k_values < vocab_size).sum().item()
if p_values is not None:
# Count rows where p < 1.0 (active top-p filtering)
active_ops += (p_values < 1.0).sum().item()
return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
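A quick worked example of that percentage: with a batch of four where two rows apply a real top-k and all four apply a real top-p, the count is (2 + 4) / 4 = 150%.

import torch

vocab_size, batch_size = 32000, 4
k = torch.tensor([50, 50, vocab_size, vocab_size], dtype=torch.int32)  # 2 of 4 active
p = torch.tensor([0.9, 0.9, 0.9, 0.9])                                 # 4 of 4 active
active_ops = (k < vocab_size).sum().item() + (p < 1.0).sum().item()
print(active_ops / batch_size * 100)  # 150.0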
def create_logits(
batch_size: int, vocab_size: int, device: str = "cuda"
) -> torch.Tensor:
"""Create random logits mimicking a realistic LLM distribution.
Uses a Zipf-like probability distribution (rank^-1.1) converted to logits
via log, then randomly permuted per row. This produces a peaked distribution
where a small number of tokens capture most probability mass, similar to
real model outputs.
"""
# Create Zipf-like probabilities: p(rank) ~ rank^(-alpha)
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
probs = ranks.pow(-1.1)
probs = probs / probs.sum()
# Convert to logits (log-probabilities, unnormalized is fine)
base_logits = probs.log()
# Broadcast to batch and randomly permute each row
logits = base_logits.unsqueeze(0).expand(batch_size, -1).clone()
for i in range(batch_size):
logits[i] = logits[i, torch.randperm(vocab_size, device=device)]
return logits
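The per-row torch.randperm loop above is simple but costs one Python-level call per row; an equivalent vectorized shuffle (sketch below) draws random keys and argsorts them to get an independent permutation per row. This is an optional rewrite for illustration, not part of the original benchmark.

import torch

batch_size, vocab_size = 4, 10
ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32)
base_logits = (ranks.pow(-1.1) / ranks.pow(-1.1).sum()).log()  # same Zipf-like logits

perm = torch.rand(batch_size, vocab_size).argsort(dim=-1)      # one permutation per row
logits = base_logits.expand(batch_size, -1).gather(-1, perm)
print(logits.shape)  # torch.Size([4, 10])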
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.cuda.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
gc.collect()
def benchmark_function(
func,
logits: torch.Tensor,
k: torch.Tensor | None,
p: torch.Tensor | None,
warmup_iters: int = 5,
benchmark_iters: int = 20,
) -> tuple[float, int]:
"""
Benchmark a function and return (avg_time_ms, peak_memory_bytes).
Returns average time in milliseconds and peak memory usage.
"""
# Warmup
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
# Benchmark
start_events = [
torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
]
end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
for i in range(benchmark_iters):
logits_copy = logits.clone()
start_events[i].record()
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
# Calculate timing
times = [
start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
]
avg_time = sum(times) / len(times)
# Get peak memory
_, peak_memory = measure_memory()
return avg_time, peak_memory
def create_benchmark_configs(
batch_sizes: list[int],
vocab_sizes: list[int],
device: str = "cuda",
) -> list[BenchmarkConfig]:
"""Create all benchmark configurations."""
configs = []
for vocab_size in vocab_sizes:
for batch_size in batch_sizes:
# 1. Top-k only - whole batch (all rows have k < vocab_size)
k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_all,
p_values=None,
description=f"Top-k only (whole batch, k=50), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_all, None, vocab_size, batch_size),
)
)
# 2. Top-k only - partial batch (half have k=50, half have k=vocab_size)
k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
k_partial[batch_size // 2 :] = vocab_size # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topk_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_partial,
p_values=None,
description=f"Top-k only (partial batch, 50% k=50, 50% k=vocab), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_partial, None, vocab_size, batch_size),
)
)
# 3. Top-p only - whole batch (all rows have p < 1.0)
p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_all,
description=f"Top-p only (whole batch, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_all, vocab_size, batch_size),
)
)
# 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
p_partial = torch.full(
(batch_size,), 0.9, dtype=torch.float32, device=device
)
p_partial[batch_size // 2 :] = 1.0 # No filtering for second half
configs.append(
BenchmarkConfig(
name=f"topp_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=None,
p_values=p_partial,
description=f"Top-p only (partial batch, 50% p=0.9, 50% p=1.0), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(None, p_partial, vocab_size, batch_size),
)
)
# 5. Mix of top-k and top-p (both applied to whole batch)
k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
configs.append(
BenchmarkConfig(
name=f"topk_topp_whole_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mix,
p_values=p_mix,
description=f"Top-k + Top-p (whole batch, k=100, p=0.9), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mix, p_mix, vocab_size, batch_size),
)
)
# 6. Mix with partial application (some rows k only, some p only, some both)
k_mixed = torch.full(
(batch_size,), vocab_size, dtype=torch.int32, device=device
)
p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
# First third: k only
third = batch_size // 3
k_mixed[:third] = 50
# Second third: p only
p_mixed[third : 2 * third] = 0.5
# Last third: both k and p
k_mixed[2 * third :] = 100
p_mixed[2 * third :] = 0.9
configs.append(
BenchmarkConfig(
name=f"mixed_partial_b{batch_size}_v{vocab_size // 1000}k",
batch_size=batch_size,
vocab_size=vocab_size,
k_values=k_mixed,
p_values=p_mixed,
description=f"Mixed partial (1/3 k=50, 1/3 p=0.9, 1/3 both), "
f"batch={batch_size}, vocab={vocab_size}",
ops_pct=calculate_ops_pct(k_mixed, p_mixed, vocab_size, batch_size),
)
)
return configs
def format_memory(bytes_val: int) -> str:
"""Format memory in human-readable form."""
if bytes_val >= 1024**3:
return f"{bytes_val / (1024**3):.2f} GB"
elif bytes_val >= 1024**2:
return f"{bytes_val / (1024**2):.2f} MB"
elif bytes_val >= 1024:
return f"{bytes_val / 1024:.2f} KB"
return f"{bytes_val} B"
def run_benchmark(
configs: list[BenchmarkConfig],
warmup_iters: int = 5,
benchmark_iters: int = 20,
verbose: bool = True,
):
"""Run all benchmarks and print results."""
results = []
print("=" * 100)
print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
print("=" * 100)
print()
for config in configs:
if verbose:
print(f"Running: {config.description}")
# Create fresh logits for this config
logits = create_logits(config.batch_size, config.vocab_size)
# Benchmark Triton
reset_memory_stats()
triton_time, triton_mem = benchmark_function(
apply_top_k_top_p_triton,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
# Benchmark PyTorch
reset_memory_stats()
pytorch_time, pytorch_mem = benchmark_function(
apply_top_k_top_p_pytorch,
logits,
config.k_values,
config.p_values,
warmup_iters,
benchmark_iters,
)
speedup = pytorch_time / triton_time if triton_time > 0 else float("inf")
mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf")
result = {
"config": config,
"triton_time_ms": triton_time,
"pytorch_time_ms": pytorch_time,
"triton_mem": triton_mem,
"pytorch_mem": pytorch_mem,
"speedup": speedup,
"mem_ratio": mem_ratio,
}
results.append(result)
if verbose:
print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}")
print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
print()
# Clean up
del logits
reset_memory_stats()
return results
def print_summary_table(results: list[dict]):
"""Print a summary table of results."""
print()
print("=" * 130)
print("SUMMARY TABLE")
print("=" * 130)
print()
# Header
header = (
f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} "
f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} "
f"{'Tri Mem':>10} {'Pyt Mem':>10}"
)
print(header)
print("-" * 130)
# Group by scenario type
current_vocab = None
for result in results:
config = result["config"]
# Add separator between vocab sizes
if current_vocab != config.vocab_size:
if current_vocab is not None:
print("-" * 130)
current_vocab = config.vocab_size
scenario = config.name.split("_b")[0] # Extract scenario name
print(
f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} "
f"{config.ops_pct:>5.0f}% "
f"{result['triton_time_ms']:>12.3f} {result['pytorch_time_ms']:>13.3f} "
f"{result['speedup']:>7.2f}x "
f"{format_memory(result['triton_mem']):>10} "
f"{format_memory(result['pytorch_mem']):>10}"
)
print("=" * 130)
def main():
parser = argparse.ArgumentParser(
description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 4, 16, 64, 128, 512, 1024, 2048],
help="Batch sizes to test (default: 1 4 16 64)",
)
parser.add_argument(
"--vocab-sizes",
type=int,
nargs="+",
default=[32768, 131072], # 32k, 128k
help="Vocabulary sizes to test (default: 32768 131072)",
)
parser.add_argument(
"--warmup-iters",
type=int,
default=5,
help="Number of warmup iterations (default: 5)",
)
parser.add_argument(
"--benchmark-iters",
type=int,
default=20,
help="Number of benchmark iterations (default: 20)",
)
parser.add_argument(
"--quiet",
action="store_true",
help="Only print summary table",
)
args = parser.parse_args()
# Print configuration
print(f"Batch sizes: {args.batch_sizes}")
print(f"Vocab sizes: {args.vocab_sizes}")
print(f"Warmup iterations: {args.warmup_iters}")
print(f"Benchmark iterations: {args.benchmark_iters}")
print()
# Check CUDA
if not torch.cuda.is_available():
print("ERROR: CUDA is not available. This benchmark requires a GPU.")
return
device_name = torch.cuda.get_device_name(0)
print(f"GPU: {device_name}")
print()
# Create configs
configs = create_benchmark_configs(
args.batch_sizes,
args.vocab_sizes,
)
# Run benchmarks
results = run_benchmark(
configs,
warmup_iters=args.warmup_iters,
benchmark_iters=args.benchmark_iters,
verbose=not args.quiet,
)
# Print summary
print_summary_table(results)
if __name__ == "__main__":
main()

View File

@@ -13,7 +13,6 @@ from torch.utils.benchmark import Measurement as TMeasurement
from tqdm import tqdm
import vllm._custom_ops as ops
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
@@ -292,7 +291,6 @@ def print_timers(timers: Iterable[TMeasurement]):
compare.print()
@default_vllm_config()
def main():
torch.set_default_device("cuda")
bench_params = get_bench_params()

View File

@@ -8,7 +8,6 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp,
)
@@ -41,7 +40,6 @@ DEEPSEEK_V3_SHAPES = [
]
@default_vllm_config()
def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2

View File

@@ -7,7 +7,6 @@ from unittest.mock import patch
import pandas as pd
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
@@ -85,7 +84,6 @@ def calculate_diff(
configs = []
@default_vllm_config()
def benchmark_quantization(
batch_size,
hidden_size,

View File

@@ -7,7 +7,6 @@ import itertools
import torch
import vllm.model_executor.layers.activation # noqa F401
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.custom_op import op_registry
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -19,7 +18,6 @@ intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
@default_vllm_config()
def benchmark_activation(
batch_size: int,
seq_len: int,

View File

@@ -11,7 +11,6 @@ import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_dummy_moe_config
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
@@ -162,7 +161,7 @@ def bench_run(
w2_fp8q_cutlass,
topk_weights,
topk_ids,
activation=MoEActivation.SILU,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()

View File

@@ -30,9 +30,6 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.flashinfer_all_reduce import (
FlashInferAllReduce,
)
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
@@ -47,7 +44,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)
# Default sequence lengths to benchmark
DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
# Fixed hidden size and dtype for all benchmarks
HIDDEN_SIZE = 8192
@@ -84,7 +81,6 @@ class CommunicatorBenchmark:
self.symm_mem_comm = None
self.symm_mem_comm_multimem = None
self.symm_mem_comm_two_shot = None
self.fi_ar_comm = None
self._init_communicators()
@@ -165,22 +161,6 @@ class CommunicatorBenchmark:
)
self.symm_mem_comm_two_shot = None
try:
self.fi_ar_comm = FlashInferAllReduce(
group=self.cpu_group,
device=self.device,
)
if not self.fi_ar_comm.disabled:
logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
else:
logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
self.fi_ar_comm = None
except Exception as e:
logger.warning(
"Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
)
self.fi_ar_comm = None
def benchmark_allreduce(
self, sequence_length: int, num_warmup: int, num_trials: int
) -> dict[str, float]:
@@ -200,8 +180,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
None, # no destroy function
"1stage", # env variable value
)
)
# CustomAllreduce two-shot
@@ -211,8 +190,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.custom_all_reduce(t),
lambda t, c=comm: c.should_custom_ar(t),
comm.capture(),
{"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
None, # no destroy function
"2stage", # env variable value
)
)
@@ -224,8 +202,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t: True, # Always available if initialized
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
communicators.append(
@@ -234,8 +211,7 @@ class CommunicatorBenchmark:
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
@@ -247,8 +223,7 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
{}, # no env variable needed
None, # no destroy function
None, # no env variable needed
)
)
@@ -260,67 +235,29 @@ class CommunicatorBenchmark:
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_symm_mem(t),
nullcontext(),
{}, # no env variable needed
None, # no destroy function needed
)
)
if self.fi_ar_comm is not None:
comm = self.fi_ar_comm
communicators.append(
(
"flashinfer_trtllm",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
lambda c=comm: c.destroy(),
)
)
communicators.append(
(
"flashinfer_mnnvl",
lambda t, c=comm: c.all_reduce(t),
lambda t, c=comm: c.should_use_fi_ar(t),
nullcontext(),
{"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
lambda c=comm: c.destroy(),
None, # no env variable needed
)
)
# Benchmark each communicator
for (
name,
allreduce_fn,
should_use_fn,
context,
env_dict,
destroy_fn,
) in communicators:
# Save original values and apply new environment variables
saved_env = {key: os.environ.get(key) for key in env_dict}
for key, value in env_dict.items():
os.environ[key] = value
try:
latency = self.benchmark_allreduce_single(
sequence_length,
allreduce_fn,
should_use_fn,
context,
num_warmup,
num_trials,
)
if latency is not None:
results[name] = latency
finally:
if destroy_fn is not None:
destroy_fn()
# Restore environment variables to their original state
for key, original_value in saved_env.items():
if original_value is None:
os.environ.pop(key, None)
else:
os.environ[key] = original_value
for name, allreduce_fn, should_use_fn, context, env_var in communicators:
# Set environment variable if needed
if env_var is not None:
os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
else:
# Clear the environment variable to avoid interference
os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
latency = self.benchmark_allreduce_single(
sequence_length,
allreduce_fn,
should_use_fn,
context,
num_warmup,
num_trials,
)
if latency is not None:
results[name] = latency
return results
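# The removed loop above saves the original environment variables, applies the
# per-communicator overrides, and restores everything afterward. A compact
# equivalent of that idiom (illustrative only, not the benchmark's own helper):
import os
from contextlib import contextmanager

@contextmanager
def temp_env(overrides: dict[str, str]):
    saved = {k: os.environ.get(k) for k in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for k, old in saved.items():
            if old is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = old

# Usage: with temp_env({"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"}): run_one_case()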

View File

@@ -5,11 +5,8 @@
Benchmark for FlashInfer fused collective operations vs standard operations.
This benchmark compares:
1. FlashInfer's allreduce_fusion with trtllm backend
(fused allreduce + rmsnorm + optional FP8/FP4 quant)
2. FlashInfer's allreduce_fusion with mnnvl backend
(fused allreduce + rmsnorm only, no quantization support)
3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
Usage with torchrun:
torchrun --nproc_per_node=2 benchmark_fused_collective.py
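# For reference, the "standard" baseline named in the docstring is the unfused
# two-step path: an all-reduce followed by a separate RMSNorm kernel. A minimal
# sketch (illustrative only; assumes torch.distributed is already initialized):
import torch
import torch.distributed as dist

def unfused_allreduce_rmsnorm(x: torch.Tensor, residual: torch.Tensor,
                              weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Step 1: all-reduce the partial results across tensor-parallel ranks.
    dist.all_reduce(x)
    # Step 2: add the residual, then apply RMSNorm as a separate operation.
    x = x + residual
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight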
@@ -27,6 +24,7 @@ import torch.distributed as dist # type: ignore
from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.distributed import (
get_tp_group,
tensor_model_parallel_all_reduce,
)
from vllm.distributed.parallel_state import (
@@ -51,19 +49,14 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
logger = init_logger(__name__)
# Try to import FlashInfer
TorchDistBackend = None
try:
import flashinfer.comm as flashinfer_comm # type: ignore
from flashinfer.comm.mnnvl import ( # type: ignore
TorchDistBackend,
)
if not (
hasattr(flashinfer_comm, "allreduce_fusion")
and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace")
):
if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
flashinfer_comm = None
logger.warning("FlashInfer comm module found but missing allreduce_fusion API")
logger.warning(
"FlashInfer comm module found but missing trtllm_allreduce_fusion"
)
except ImportError:
flashinfer_comm = None
logger.warning("FlashInfer not found, only benchmarking standard operations")
@@ -81,70 +74,57 @@ _FI_MAX_SIZES = {
8: 64 * MiB, # 64MB
}
# Global workspace tensors for FlashInfer (keyed by backend name)
_FI_WORKSPACES: dict = {}
# Backends to benchmark
FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
# Global workspace tensor for FlashInfer
_FI_WORKSPACE_TENSOR = None
def setup_flashinfer_workspace(
backend: str,
world_size: int,
rank: int,
hidden_dim: int,
max_token_num: int,
dtype: torch.dtype,
use_fp32_lamport: bool = False,
):
"""Setup FlashInfer workspace for fused allreduce operations."""
    global _FI_WORKSPACES
global _FI_WORKSPACE_TENSOR
if flashinfer_comm is None:
return None
return None, None
if world_size not in _FI_MAX_SIZES:
logger.warning("FlashInfer not supported for world size %s", world_size)
return None
return None, None
try:
kwargs = {}
if TorchDistBackend is not None:
kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
workspace = flashinfer_comm.create_allreduce_fusion_workspace(
backend=backend,
world_size=world_size,
rank=rank,
max_token_num=max_token_num,
hidden_dim=hidden_dim,
dtype=dtype,
**kwargs,
# Create IPC workspace
ipc_handles, workspace_tensor = (
flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
tp_rank=rank,
tp_size=world_size,
max_token_num=max_token_num,
hidden_dim=hidden_dim,
group=get_tp_group().device_group,
use_fp32_lamport=use_fp32_lamport,
)
)
_FI_WORKSPACES[backend] = workspace
return workspace
_FI_WORKSPACE_TENSOR = workspace_tensor
return ipc_handles, workspace_tensor
except Exception as e:
logger.error(
"Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
)
return None
logger.error("Failed to setup FlashInfer workspace: %s", e)
return None, None
def cleanup_flashinfer_workspaces():
"""Cleanup all FlashInfer workspaces."""
if flashinfer_comm is None:
def cleanup_flashinfer_workspace(ipc_handles):
"""Cleanup FlashInfer workspace."""
if flashinfer_comm is None or ipc_handles is None:
return
for backend, workspace in _FI_WORKSPACES.items():
try:
workspace.destroy()
except Exception as e:
logger.error(
"Failed to cleanup FlashInfer workspace (backend=%s): %s",
backend,
e,
)
_FI_WORKSPACES.clear()
try:
group = get_tp_group().device_group
flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
except Exception as e:
logger.error("Failed to cleanup FlashInfer workspace: %s", e)
class FlashInferFusedAllReduceParams:
@@ -152,15 +132,25 @@ class FlashInferFusedAllReduceParams:
def __init__(
self,
rank: int,
world_size: int,
use_fp32_lamport: bool = False,
max_token_num: int = 1024,
):
self.rank = rank
self.world_size = world_size
self.use_fp32_lamport = use_fp32_lamport
self.trigger_completion_at_end = True
self.launch_with_pdl = True
self.fp32_acc = True
self.max_token_num = max_token_num
def get_flashinfer_fused_allreduce_kwargs(self):
def get_trtllm_fused_allreduce_kwargs(self):
return {
"world_rank": self.rank,
"world_size": self.world_size,
"launch_with_pdl": self.launch_with_pdl,
"trigger_completion_at_end": self.trigger_completion_at_end,
"fp32_acc": self.fp32_acc,
}
@@ -171,12 +161,11 @@ def flashinfer_fused_allreduce_rmsnorm(
rms_gamma: torch.Tensor,
rms_eps: float,
allreduce_params: "FlashInferFusedAllReduceParams",
workspace: object,
use_oneshot: bool,
norm_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm operation."""
if flashinfer_comm is None or workspace is None:
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -185,25 +174,24 @@ def flashinfer_fused_allreduce_rmsnorm(
else:
residual_out = input_tensor
layout_code = None
if workspace.backend == "trtllm":
layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
allreduce_out=None,
quant_out=None,
scale_out=None,
layout_code=layout_code,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=None,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -214,16 +202,12 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
rms_eps: float,
scale_factor: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
use_oneshot: bool = True,
norm_out: torch.Tensor | None = None,
quant_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization.
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
"""FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -232,21 +216,24 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
else:
residual_out = input_tensor
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
allreduce_out=None,
quant_out=quant_out,
scale_out=None,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=scale_factor,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -257,17 +244,13 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
rms_eps: float,
input_global_scale: torch.Tensor,
allreduce_params: FlashInferFusedAllReduceParams,
workspace: object,
quant_out: torch.Tensor,
use_oneshot: bool,
output_scale: torch.Tensor,
norm_out: torch.Tensor | None = None,
):
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization.
Note: Only supported by the trtllm backend.
"""
if flashinfer_comm is None or workspace is None:
"""FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
raise RuntimeError("FlashInfer not available or workspace not initialized")
if norm_out is None:
@@ -276,21 +259,24 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
else:
residual_out = input_tensor
flashinfer_comm.allreduce_fusion(
input=input_tensor,
workspace=workspace,
pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
flashinfer_comm.trtllm_allreduce_fusion(
allreduce_in=input_tensor,
token_num=input_tensor.shape[0],
residual_in=residual,
residual_out=residual_out,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
hidden_dim=input_tensor.shape[-1],
workspace_ptrs=_FI_WORKSPACE_TENSOR,
pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
allreduce_out=None,
quant_out=quant_out,
scale_out=output_scale,
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
scale_factor=input_global_scale,
use_oneshot=use_oneshot,
**allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
**allreduce_params.get_trtllm_fused_allreduce_kwargs(),
)
@@ -423,16 +409,13 @@ def run_benchmarks(
dtype: torch.dtype,
use_residual: bool,
allreduce_params: FlashInferFusedAllReduceParams | None,
workspaces: dict,
quant_modes: set[str],
no_oneshot: bool,
):
"""Run all benchmarks for given configuration.
Args:
allreduce_params: Shared parameters for FlashInfer fused allreduce.
workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
quant_modes: Set of quantization modes: "none", "fp8", "fp4".
quant_mode: "none", "fp8_only", "fp4_only", or "all"
"""
(
input_tensor,
@@ -448,18 +431,18 @@ def run_benchmarks(
rms_eps = 1e-6
results = {}
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
use_oneshot_options = [False] if no_oneshot else [True, False]
# Create RMSNorm and QuantFP8 layers once for native benchmarks
if "none" in quant_modes:
# Standard AllReduce + RMSNorm
# Re-create VllmFusedAllreduce per config so CustomOp binds the
# correct forward method (native vs custom kernel).
for custom_op in ["-rms_norm", "+rms_norm"]:
with set_current_vllm_config(
VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
suffix = (
"_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
)
@@ -478,7 +461,6 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm,
fullgraph=True,
@@ -494,11 +476,10 @@ def run_benchmarks(
logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
# FlashInfer Fused AllReduce + RMSNorm (all backends)
for backend, workspace in workspaces.items():
# FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm,
@@ -508,17 +489,14 @@ def run_benchmarks(
rms_gamma=rms_gamma,
rms_eps=rms_eps,
allreduce_params=allreduce_params,
workspace=workspace,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
except Exception as e:
logger.error(
"FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
backend,
e,
logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
"inf"
)
results[key] = float("inf")
if "fp8" in quant_modes:
# Standard AllReduce + RMSNorm + FP8 Quant
@@ -527,7 +505,7 @@ def run_benchmarks(
"_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
)
for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
op_suffix = suffix + (
suffix += (
"_custom_quant_fp8"
if "+" in quant_fp8_custom_op
else "_native_quant_fp8"
@@ -540,17 +518,16 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
input_tensor,
residual=residual,
scale_factor=scale_fp8,
)
results[f"standard_allreduce{op_suffix}"] = time_ms
results[f"standard_allreduce{suffix}"] = time_ms
except Exception as e:
logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
results[f"standard_allreduce{op_suffix}"] = float("inf")
results[f"standard_allreduce{suffix}"] = float("inf")
# Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
with set_current_vllm_config(
@@ -561,7 +538,6 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
fullgraph=True,
@@ -584,12 +560,10 @@ def run_benchmarks(
"inf"
)
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
# FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp8_quant,
@@ -601,16 +575,19 @@ def run_benchmarks(
scale_factor=scale_fp8,
quant_out=quant_out_fp8,
allreduce_params=allreduce_params,
workspace=trtllm_ws,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
"FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
e,
)
results[key] = float("inf")
results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
float("inf")
)
if "fp4" in quant_modes and current_platform.has_device_capability(100):
# Standard AllReduce + RMSNorm + FP4 Quant
@@ -626,7 +603,6 @@ def run_benchmarks(
)
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
time_ms = benchmark_operation(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
input_tensor,
@@ -645,7 +621,6 @@ def run_benchmarks(
VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
):
try:
vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
fullgraph=True,
@@ -670,12 +645,10 @@ def run_benchmarks(
"inf"
)
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
if "trtllm" in workspaces:
trtllm_ws = workspaces["trtllm"]
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
if flashinfer_comm is not None and allreduce_params is not None:
for use_oneshot in use_oneshot_options:
suffix = "_oneshot" if use_oneshot else "_twoshot"
key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
@@ -686,18 +659,49 @@ def run_benchmarks(
rms_eps=rms_eps,
input_global_scale=scale_fp4,
allreduce_params=allreduce_params,
workspace=trtllm_ws,
quant_out=fp4_quant_out,
output_scale=fp4_output_scale,
use_oneshot=use_oneshot,
)
results[key] = time_ms
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
"FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
e,
)
results[key] = float("inf")
results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
float("inf")
)
# FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
if flashinfer_comm is not None and allreduce_params is not None:
try:
time_ms = benchmark_operation(
flashinfer_fused_allreduce_rmsnorm_fp4_quant,
input_tensor,
residual=residual,
norm_out=norm_out,
rms_gamma=rms_gamma,
rms_eps=rms_eps,
input_global_scale=scale_fp4,
allreduce_params=allreduce_params,
quant_out=fp4_quant_out,
output_scale=fp4_output_scale,
use_oneshot=False,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
time_ms
)
except Exception as e:
logger.error(
"FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
e,
)
results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
"inf"
)
return results
@@ -1035,33 +1039,24 @@ def main():
configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
# Setup FlashInfer workspaces for all backends
# Setup FlashInfer workspace if available
ipc_handles = None
allreduce_params = None
if flashinfer_comm is not None:
# Use the largest hidden dimension for workspace setup
max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)
workspace_dtype = (
torch.float32
if max_element_size == 4
else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16)
)
max_num_token = _FI_MAX_SIZES.get(world_size) // (
args.hidden_dim * max_element_size
args.hidden_dim * world_size * 2
)
for backend in FLASHINFER_BACKENDS:
setup_flashinfer_workspace(
backend=backend,
world_size=world_size,
rank=rank,
hidden_dim=args.hidden_dim,
max_token_num=max_num_token,
dtype=workspace_dtype,
)
ipc_handles, workspace_tensor = setup_flashinfer_workspace(
world_size, rank, args.hidden_dim, max_num_token
)
if _FI_WORKSPACES:
if workspace_tensor is not None:
allreduce_params = FlashInferFusedAllReduceParams(
rank=rank,
world_size=world_size,
max_token_num=max_num_token,
)
@@ -1086,7 +1081,6 @@ def main():
dtype,
use_residual,
allreduce_params,
workspaces=_FI_WORKSPACES,
quant_modes=quant_modes,
no_oneshot=args.no_oneshot,
)
@@ -1125,13 +1119,11 @@ def main():
finally:
# Cleanup
cleanup_flashinfer_workspaces()
if ipc_handles is not None:
cleanup_flashinfer_workspace(ipc_handles)
dist.barrier()
if __name__ == "__main__":
from vllm.config import VllmConfig, set_current_vllm_config
with set_current_vllm_config(VllmConfig()):
main()
main()

View File

@@ -5,14 +5,12 @@ import time
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode()
@default_vllm_config()
def main(
num_tokens: int,
hidden_size: int,

View File

@@ -16,7 +16,6 @@ import torch
from ray.experimental.tqdm_ray import tqdm
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
@@ -100,38 +99,13 @@ def benchmark_config(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
num_iters: int = 100,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int4_w4a16:
# Int4 packed weights: 2 int4 values per uint8 byte
# K dimension is packed (halved)
intermediate_size = shard_intermediate_size // 2 # after silu_and_mul
w1 = torch.randint(
0,
255,
(
num_experts,
shard_intermediate_size,
hidden_size // 2, # int4 packing
),
dtype=torch.uint8,
)
w2 = torch.randint(
0,
255,
(
num_experts,
hidden_size,
intermediate_size // 2, # int4 packing
),
dtype=torch.uint8,
)
elif use_int8_w8a16:
if use_int8_w8a16:
w1 = torch.randint(
-127,
127,
@@ -165,20 +139,7 @@ def benchmark_config(
w2_scale = None
a1_scale = None
a2_scale = None
if use_int4_w4a16:
if block_quant_shape is None:
raise ValueError("block_quant_shape is required for int4_w4a16")
group_size = block_quant_shape[1]
# Scales shape: (E, N, K // group_size) in fp16
w1_scale = torch.rand(
(num_experts, shard_intermediate_size, hidden_size // group_size),
dtype=dtype,
)
w2_scale = torch.rand(
(num_experts, hidden_size, intermediate_size // group_size),
dtype=dtype,
)
elif use_int8_w8a16:
if use_int8_w8a16:
w1_scale = torch.randn(
(num_experts, 2 * shard_intermediate_size), dtype=torch.float32
)
@@ -237,7 +198,6 @@ def benchmark_config(
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
weight_dtype="int4" if use_int4_w4a16 else None,
)
deep_gemm_experts = None
@@ -251,8 +211,7 @@ def benchmark_config(
hidden_dim=hidden_size,
intermediate_size_per_partition=shard_intermediate_size,
num_local_experts=num_experts,
num_logical_experts=num_experts,
activation=MoEActivation.SILU,
activation="silu",
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
in_dtype=init_dtype,
routing_method=RoutingMethodType.TopK,
@@ -267,10 +226,9 @@ def benchmark_config(
x, input_gating, topk, renormalize=not use_deep_gemm
)
inplace = not disable_inplace()
if use_deep_gemm:
return deep_gemm_experts(
x, w1, w2, topk_weights, topk_ids, inplace=inplace
x, w1, w2, topk_weights, topk_ids, inplace=True
)
return fused_experts(
x,
@@ -278,7 +236,7 @@ def benchmark_config(
w2,
topk_weights,
topk_ids,
inplace=inplace,
inplace=True,
quant_config=quant_config,
)
@@ -520,7 +478,6 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool = False,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
@@ -528,10 +485,7 @@ class BenchmarkWorker:
set_random_seed(self.seed)
dtype_str = _get_config_dtype_str(
dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
@@ -562,7 +516,6 @@ class BenchmarkWorker:
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
num_iters=100,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm,
@@ -579,7 +532,6 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
use_deep_gemm: bool,
@@ -590,7 +542,7 @@ class BenchmarkWorker:
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = prune_rocm_search_space(
num_tokens,
shard_intermediate_size,
@@ -619,7 +571,6 @@ class BenchmarkWorker:
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
num_iters=20,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm,
@@ -667,7 +618,6 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
else {}
),
**({"kpack": config["kpack"]} if "kpack" in config else {}),
**({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
}
@@ -680,15 +630,11 @@ def save_configs(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
block_quant_shape: list[int],
save_dir: str,
) -> None:
dtype_str = _get_config_dtype_str(
dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8,
use_int4_w4a16=use_int4_w4a16,
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -790,38 +736,6 @@ def get_model_params(config):
return E, topk, intermediate_size, hidden_size
def get_quantization_group_size(config) -> int | None:
"""Extract the quantization group size from the HF model config.
This reads directly from the HuggingFace config object (as returned by
``get_config()``), not from vLLM's quantization config classes.
Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
compressed-tensors configs (nested inside 'config_groups').
"""
quantization_config = getattr(config, "quantization_config", {})
if not isinstance(quantization_config, dict):
return None
# AWQ / GPTQ style: group_size is a top-level key
gs = quantization_config.get("group_size")
if gs is not None:
return gs
# compressed-tensors style: group_size is nested in config_groups
config_groups = quantization_config.get("config_groups", {})
if not isinstance(config_groups, dict):
return None
for group_cfg in config_groups.values():
if not isinstance(group_cfg, dict):
continue
weights = group_cfg.get("weights", {})
if not isinstance(weights, dict):
continue
gs = weights.get("group_size")
if gs is not None:
return gs
return None
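# For reference, the two quantization_config shapes the helper above parses look
# roughly like this (field values are illustrative):
awq_style = {"quant_method": "awq", "bits": 4, "group_size": 128}
compressed_tensors_style = {
    "quant_method": "compressed-tensors",
    "config_groups": {
        "group_0": {"weights": {"num_bits": 4, "group_size": 128}},
    },
}
# Either shape, attached to the HF config as config.quantization_config, would make
# get_quantization_group_size(config) return 128.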
def main(args: argparse.Namespace):
print(args)
@@ -840,20 +754,7 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
block_quant_shape = get_weight_block_size_safety(config)
if use_int4_w4a16:
group_size = get_quantization_group_size(config)
if group_size is None:
raise ValueError(
"Could not determine group_size from model config. "
"The model's quantization_config must contain a 'group_size' "
"field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
"(compressed-tensors)."
)
# For int4_w4a16, block_shape = [0, group_size]
# block_shape[0]=0 means no block quantization on N dimension
block_quant_shape = [0, group_size]
if args.batch_size is None:
batch_sizes = [
@@ -907,20 +808,8 @@ def main(args: argparse.Namespace):
return ray.get(outputs)
if args.tune:
# int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
# search space generation (no matrix_instr_nonkdim/kpack exploration).
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
# For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
# apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
# of group_size. Skip block_quant_shape filtering to keep the full
# search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
if use_int4_w4a16:
# SPLIT_K is a required kernel constexpr for gptq_awq kernel;
# only SPLIT_K=1 is used at runtime, so fix it during tuning.
for cfg in search_space:
cfg["SPLIT_K"] = 1
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
if use_deep_gemm:
raise ValueError(
@@ -940,7 +829,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
search_space,
block_quant_shape,
use_deep_gemm,
@@ -960,7 +848,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
block_quant_shape,
args.save_dir,
)
@@ -979,7 +866,6 @@ def main(args: argparse.Namespace):
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
block_quant_shape,
use_deep_gemm,
)
@@ -1002,10 +888,7 @@ if __name__ == "__main__":
)
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
parser.add_argument(
"--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
default="auto",
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument(

View File

@@ -1,278 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark comparing old vs new default fused MoE configs.
Runs the triton fused_moe kernel with three configurations for each scenario:
1. Tuned config (from JSON file, if available) — the target to match
2. Old default (the hardcoded defaults before this change)
3. New default (the improved defaults)
Usage:
python benchmarks/kernels/benchmark_moe_defaults.py
Produces a table showing kernel time (us) and speedup of new vs old defaults.
"""
import torch
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
get_default_config,
get_moe_configs,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils.torch_utils import set_random_seed
FP8_DTYPE = current_platform.fp8_dtype()
def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
"""The original defaults before https://github.com/vllm-project/vllm/pull/34846,
for comparison."""
if dtype == "fp8_w8a8" and block_shape is not None:
return {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": block_shape[0],
"BLOCK_SIZE_K": block_shape[1],
"GROUP_SIZE_M": 32,
"SPLIT_K": 1,
"num_warps": 4,
"num_stages": 3 if not current_platform.is_rocm() else 2,
}
elif M <= E:
return {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"SPLIT_K": 1,
}
else:
return {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
}
def benchmark_config(
config,
M,
E,
N,
K,
topk,
dtype,
use_fp8=False,
block_shape=None,
num_iters=100,
):
"""Time a single kernel config. Returns kernel time in microseconds."""
init_dtype = torch.float16 if use_fp8 else dtype
a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if use_fp8:
if block_shape is not None:
bsn, bsk = block_shape
n_tiles_w1 = triton.cdiv(2 * N, bsn)
k_tiles_w1 = triton.cdiv(K, bsk)
n_tiles_w2 = triton.cdiv(K, bsn)
k_tiles_w2 = triton.cdiv(N, bsk)
w1_scale = torch.rand(
E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
)
w2_scale = torch.rand(
E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
)
else:
w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
# Only weights are stored in fp8; activations stay in bf16/fp16
# and get dynamically quantized inside the kernel.
w1 = w1.to(FP8_DTYPE)
w2 = w2.to(FP8_DTYPE)
quant_config = FusedMoEQuantConfig.make(
quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_shape,
)
gating = torch.randn(M, E, device="cuda", dtype=torch.float32)
# Warmup
for _ in range(20):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(num_iters):
with override_config(config):
topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
end.record()
torch.cuda.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
# N = moe_intermediate_size // tp_size (the value used in config file lookup)
MODELS = [
# --- Few experts ---
("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
# --- Many experts: real model shapes at tp=1 ---
# Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
# DeepSeek-V2: E=64, topk=6, N=1407, K=4096
# (use 1408 to avoid odd alignment; real model is 1407)
("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
# OLMoE-7B: E=64, topk=8, N=2048, K=2048
("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
# GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
# Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
# DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
# Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
# E=128 N=1856 bf16
("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
# E=256 N=512 bf16 (DS-V3 tp=4)
("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
# E=512 N=512 bf16 (Qwen3-Next tp=1)
("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
# E=512 N=256 bf16 (Qwen3-Next tp=2)
("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
# --- FP8 block quant (many experts) ---
# DS-V3 tp=4: E=256, N=512, fp8 block
("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
# DS-V3 tp=8: E=256, N=256, fp8 block
("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
# Qwen3-Next tp=2 fp8 block
("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
]
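# Quick consistency check of the N = moe_intermediate_size // tp_size convention
# noted above the MODELS list, using the DeepSeek-V3 rows (illustrative only):
deepseek_v3_moe_intermediate_size = 2048
for tp_size, expected_n in [(1, 2048), (4, 512), (8, 256)]:
    assert deepseek_v3_moe_intermediate_size // tp_size == expected_n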
BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
def main():
set_random_seed(0)
torch.set_default_device("cuda")
dtype = torch.bfloat16
for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
print(f"\n{'=' * 90}")
print(f" {name} (E={E}, N={N}, K={K}, topk={topk})")
print(f"{'=' * 90}")
# Try to load tuned config
block_n = block_shape[0] if block_shape else None
block_k = block_shape[1] if block_shape else None
tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
has_tuned = tuned is not None
print(f" Tuned config available: {has_tuned}")
hdr = (
f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
)
print(f" {hdr}")
print(f" {'-' * len(hdr)}")
for M in BATCH_SIZES:
old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)
if has_tuned:
tuned_cfg = tuned[min(tuned.keys(), key=lambda x: abs(x - M))]
t_tuned = benchmark_config(
tuned_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
else:
t_tuned = None
t_old = benchmark_config(
old_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
t_new = benchmark_config(
new_cfg,
M,
E,
N,
K,
topk,
dtype,
use_fp8=use_fp8,
block_shape=block_shape,
)
ratio_new_old = t_new / t_old
tuned_str = f"{t_tuned:11.2f}" if t_tuned else f"{'N/A':>11}"
ratio_tuned = f"{t_new / t_tuned:10.2f}x" if t_tuned else f"{'N/A':>10}"
# flag regressions where new default is >5% slower than old
marker = " <--" if ratio_new_old > 1.05 else ""
print(
f" {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
)
if __name__ == "__main__":
main()

View File

@@ -36,7 +36,6 @@ from typing import Any
import numpy as np
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.transformers_utils.config import get_config
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -79,7 +78,6 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
}
@default_vllm_config()
def benchmark_mrope(
model_name: str,
num_tokens: int,

View File

@@ -5,7 +5,6 @@ import itertools
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -30,7 +29,6 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
args={},
)
)
@default_vllm_config()
def benchmark(batch_size, seq_len, num_heads, provider):
dtype = torch.bfloat16
max_position = 8192

View File

@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage
;;
*)
printf "Unknown argument: %s\n" "$1"
echo "Unknown argument: $1\n"
usage
;;
esac
@@ -84,17 +84,15 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1)
# Common parameters
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
COMMON_PARAMS="--backend $BACKEND \
--model $MODEL \
--dataset $DATASET \
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
--save-results \
--result-dir $OUTPUT_DIR \
--output-len $MAX_NEW_TOKENS \
--port $PORT \
--tokenizer-mode $TOKENIZER_MODE"
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
@@ -111,17 +109,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
--result-filename "$FILENAME" \
--num-prompts "$NUM_PROMPTS"
--num-prompts $NUM_PROMPTS
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"

View File

@@ -18,7 +18,6 @@ set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
@@ -116,10 +115,6 @@ else()
set(AVX512_FOUND ON)
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
endif()
if (ENABLE_ARM_BF16)
set(ARM_BF16_FOUND ON)
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
endif()
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)

View File

@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""

View File

@@ -1,9 +1,9 @@
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
# Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
# be directly set to the triton_kernels python directory.
# be directly set to the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
FetchContent_Declare(
@@ -24,7 +24,7 @@ else()
)
endif()
# Fetch content
# Fetch content
FetchContent_MakeAvailable(triton_kernels)
if (NOT triton_kernels_SOURCE_DIR)
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
## Copy .py files to install directory.
install(DIRECTORY
${TRITON_KERNELS_PYTHON_DIR}
DESTINATION
DESTINATION
vllm/third_party/triton_kernels/
COMPONENT triton_kernels
FILES_MATCHING PATTERN "*.py")

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -9,113 +9,6 @@
namespace vllm {
struct alignas(32) u32x8_t {
uint32_t u0, u1, u2, u3, u4, u5, u6, u7;
};
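// 256-bit global load/store helpers: on SM 10.0+ with CUDA 12.9+ they issue a single
// v8.u32 PTX instruction; otherwise they fall back to two 128-bit (uint4) accesses.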
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
: "l"(ptr));
#else
const uint4* uint_ptr = reinterpret_cast<const uint4*>(ptr);
uint4 top_half = __ldg(&uint_ptr[0]);
uint4 bottom_half = __ldg(&uint_ptr[1]);
val.u0 = top_half.x;
val.u1 = top_half.y;
val.u2 = top_half.z;
val.u3 = top_half.w;
val.u4 = bottom_half.x;
val.u5 = bottom_half.y;
val.u6 = bottom_half.z;
val.u7 = bottom_half.w;
#endif
}
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
:
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
"r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7)
: "memory");
#else
uint4* uint_ptr = reinterpret_cast<uint4*>(ptr);
uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3);
uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7);
#endif
}
template <bool support_256>
struct VecTraits;
template <>
struct VecTraits<true> {
static constexpr int ARCH_MAX_VEC_SIZE = 32;
using vec_t = u32x8_t;
};
template <>
struct VecTraits<false> {
static constexpr int ARCH_MAX_VEC_SIZE = 16;
using vec_t = int4;
};
template <typename T>
struct PackedTraits;
template <>
struct PackedTraits<c10::BFloat16> {
using packed_t = __nv_bfloat162;
};
template <>
struct PackedTraits<c10::Half> {
using packed_t = __half2;
};
template <>
struct PackedTraits<float> {
using packed_t = float2;
};
template <typename packed_t>
__device__ __forceinline__ float2 cast_to_float2(const packed_t& val) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
return __bfloat1622float2(val);
} else if constexpr (std::is_same_v<packed_t, __half2>) {
return __half22float2(val);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return float2(val);
}
}
template <typename packed_t>
__device__ __forceinline__ packed_t cast_to_packed(const float2& val) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162>) {
return __float22bfloat162_rn(val);
} else if constexpr (std::is_same_v<packed_t, __half2>) {
return __float22half2_rn(val);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return float2(val);
}
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_mul(const packed_t& x,
const packed_t& y) {
if constexpr (std::is_same_v<packed_t, __nv_bfloat162> ||
std::is_same_v<packed_t, __half2>) {
return __hmul2(x, y);
} else if constexpr (std::is_same_v<packed_t, float2>) {
return make_float2(x.x * y.x, x.y * y.y);
}
}
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
@@ -123,69 +16,52 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
}
template <typename packed_t, packed_t (*PACKED_ACT_FN)(const packed_t&),
bool act_first>
__device__ __forceinline__ packed_t packed_compute(const packed_t& x,
const packed_t& y) {
return act_first ? packed_mul(PACKED_ACT_FN(x), y)
: packed_mul(x, PACKED_ACT_FN(y));
}
// Check if all pointers are 16-byte aligned for int4 vectorized access
__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
}
// Check if all pointers are 32-byte aligned for 256-bit (u32x8_t) vectorized access
__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) {
return (reinterpret_cast<uintptr_t>(ptr) & 31) == 0;
}
// Activation and gating kernel template.
template <typename scalar_t, typename packed_t,
scalar_t (*ACT_FN)(const scalar_t&),
packed_t (*PACKED_ACT_FN)(const packed_t&), bool act_first,
bool use_vec, bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
bool act_first>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* x_ptr = input + token_idx * 2 * d;
const scalar_t* y_ptr = x_ptr + d;
scalar_t* out_ptr = out + blockIdx.x * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
// Check alignment for 128-bit vectorized access.
// All three pointers must be 16-byte aligned for safe int4 operations.
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
is_16byte_aligned(out_ptr);
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
const int num_vecs = d / 2 / VEC_SIZE;
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t x, y;
if constexpr (use_256b) {
ld256(x, &x_vec[i]);
ld256(y, &y_vec[i]);
} else {
x = VLLM_LDG(&x_vec[i]);
y = VLLM_LDG(&y_vec[i]);
}
auto* xp = reinterpret_cast<packed_t*>(&x);
auto* yp = reinterpret_cast<packed_t*>(&y);
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
auto* xp = reinterpret_cast<scalar_t*>(&x);
auto* yp = reinterpret_cast<scalar_t*>(&y);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
xp[j] =
packed_compute<packed_t, PACKED_ACT_FN, act_first>(xp[j], yp[j]);
}
if constexpr (use_256b) {
st256(x, &out_vec[i]);
} else {
out_vec[i] = x;
rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
VLLM_LDG(&y_ptr[i]));
}
} else {
// Scalar fallback for unaligned data or small d
@@ -203,15 +79,6 @@ __device__ __forceinline__ T silu_kernel(const T& x) {
return (T)(((float)x) / (1.0f + expf((float)-x)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_silu_kernel(const packed_t& val) {
// x * sigmoid(x)
float2 fval = cast_to_float2(val);
fval.x = fval.x / (1.0f + expf(-fval.x));
fval.y = fval.y / (1.0f + expf(-fval.y));
return cast_to_packed<packed_t>(fval);
}
template <typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
@@ -222,18 +89,6 @@ __device__ __forceinline__ T gelu_kernel(const T& x) {
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t packed_gelu_kernel(const packed_t& val) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
constexpr float ALPHA = M_SQRT1_2;
float2 fval = cast_to_float2(val);
fval.x = fval.x * 0.5f * (1.0f + ::erf(fval.x * ALPHA));
fval.y = fval.y * 0.5f * (1.0f + ::erf(fval.y * ALPHA));
return cast_to_packed<packed_t>(fval);
}
template <typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
@@ -247,83 +102,32 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}
template <typename packed_t>
__device__ __forceinline__ packed_t
packed_gelu_tanh_kernel(const packed_t& val) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
float2 fval = cast_to_float2(val);
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = fval.x * fval.x * fval.x;
float inner = BETA * (fval.x + KAPPA * x_cube);
fval.x = 0.5f * fval.x * (1.0f + ::tanhf(inner));
x_cube = fval.y * fval.y * fval.y;
inner = BETA * (fval.y + KAPPA * x_cube);
fval.y = 0.5f * fval.y * (1.0f + ::tanhf(inner));
return cast_to_packed<packed_t>(fval);
}
} // namespace vllm
// Launch activation and gating kernel.
// Use ACT_FIRST (bool) indicating whether to apply the activation function
// first.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \
auto dtype = input.scalar_type(); \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, true, true><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, true, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
ACT_FIRST, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
}); \
}
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
if (num_tokens == 0) { \
return; \
} \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
}
void mul_and_silu(torch::Tensor& out, // [..., d]
@@ -331,22 +135,19 @@ void mul_and_silu(torch::Tensor& out, // [..., d]
{
// The difference between mul_and_silu and silu_and_mul is that mul_and_silu
// applies the silu to the latter half of the input.
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel,
false);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
}
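For reference, a minimal host-side sketch of the semantics these gating entry points implement, written against the PyTorch C++ API (ref_silu_and_mul and ref_mul_and_silu are illustrative names, not part of this diff): silu_and_mul applies SiLU to the first half of the last dimension and multiplies by the second half, while mul_and_silu applies SiLU to the second half instead.

#include <torch/torch.h>

// Illustrative reference only; input is assumed to have shape [..., 2 * d].
torch::Tensor ref_silu_and_mul(const torch::Tensor& input) {
  auto halves = input.chunk(2, /*dim=*/-1);   // halves[0] = x, halves[1] = y
  return torch::silu(halves[0]) * halves[1];  // silu(x) * y
}

torch::Tensor ref_mul_and_silu(const torch::Tensor& input) {
  auto halves = input.chunk(2, /*dim=*/-1);
  return halves[0] * torch::silu(halves[1]);  // x * silu(y)
}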
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel,
true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
}
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel,
vllm::packed_gelu_tanh_kernel, true);
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
}
namespace vllm {
@@ -357,57 +158,42 @@ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
return (T)(f > threshold ? f : 0.0f);
}
template <typename packed_t>
__device__ __forceinline__ packed_t
packed_fatrelu_kernel(const packed_t& val, const float threshold) {
float2 fval = cast_to_float2(val);
fval.x = fval.x > threshold ? fval.x : 0.0f;
fval.y = fval.y > threshold ? fval.y : 0.0f;
return cast_to_packed<packed_t>(fval);
}
template <typename scalar_t, typename packed_t,
scalar_t (*ACT_FN)(const scalar_t&, const float),
packed_t (*PACKED_ACT_FN)(const packed_t&, const float), bool use_vec,
bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
__global__ void act_and_mul_kernel_with_param(
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
const float param) {
const scalar_t* x_ptr = input + blockIdx.x * 2 * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* x_ptr = input + token_idx * 2 * d;
const scalar_t* y_ptr = x_ptr + d;
scalar_t* out_ptr = out + blockIdx.x * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t);
// Check alignment for 128-bit vectorized access
const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
is_16byte_aligned(out_ptr);
const vec_t* x_vec = reinterpret_cast<const vec_t*>(x_ptr);
const vec_t* y_vec = reinterpret_cast<const vec_t*>(y_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
const int num_vecs = d / 2 / VEC_SIZE;
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t x, y;
if constexpr (use_256b) {
ld256(x, &x_vec[i]);
ld256(y, &y_vec[i]);
} else {
x = VLLM_LDG(&x_vec[i]);
y = VLLM_LDG(&y_vec[i]);
}
auto* xp = reinterpret_cast<packed_t*>(&x);
auto* yp = reinterpret_cast<packed_t*>(&y);
int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
auto* xp = reinterpret_cast<scalar_t*>(&x);
auto* yp = reinterpret_cast<scalar_t*>(&y);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
xp[j] = packed_mul(PACKED_ACT_FN(xp[j], param), yp[j]);
}
if constexpr (use_256b) {
st256(x, &out_vec[i]);
} else {
out_vec[i] = x;
rp[j] = ACT_FN(xp[j], param) * yp[j];
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
}
} else {
// Scalar fallback for unaligned data or small d
@@ -490,58 +276,20 @@ __global__ void swigluoai_and_mul_kernel(
} // namespace vllm
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PACKED_KERNEL, PARAM) \
auto dtype = input.scalar_type(); \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES( \
dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL< \
typename vllm::PackedTraits<scalar_t>::packed_t>, \
true, true><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
PARAM); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES( \
dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL< \
typename vllm::PackedTraits<scalar_t>::packed_t>, \
true, false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, \
PARAM); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param< \
scalar_t, typename vllm::PackedTraits<scalar_t>::packed_t, \
KERNEL<scalar_t>, \
PACKED_KERNEL<typename vllm::PackedTraits<scalar_t>::packed_t>, \
false><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d, PARAM); \
}); \
}
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \
vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d, \
PARAM); \
});
#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \
int d = input.size(-1) / 2; \
@@ -561,8 +309,7 @@ __global__ void swigluoai_and_mul_kernel(
void fatrelu_and_mul(torch::Tensor& out, // [..., d],
torch::Tensor& input, // [..., 2 * d]
double threshold) {
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(
vllm::fatrelu_kernel, vllm::packed_fatrelu_kernel, threshold);
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
}
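As a hedged host-side sketch of what fatrelu_and_mul computes (ref_fatrelu_and_mul is an illustrative name, not part of this diff): elements in the first half of the last dimension pass through only when they exceed the threshold, and the result gates the second half.

// Illustrative reference only; input is assumed to have shape [..., 2 * d].
torch::Tensor ref_fatrelu_and_mul(const torch::Tensor& input, double threshold) {
  auto halves = input.chunk(2, /*dim=*/-1);
  auto gated = torch::where(halves[0] > threshold,
                            halves[0], torch::zeros_like(halves[0]));
  return gated * halves[1];  // fatrelu(x, threshold) * y
}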
void swigluoai_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., 2 * d]
@@ -572,41 +319,39 @@ void swigluoai_and_mul(torch::Tensor& out, // [..., d]
namespace vllm {
// Element-wise activation kernel template.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), bool use_vec,
bool use_256b = false>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
const scalar_t* in_ptr = input + blockIdx.x * d;
scalar_t* out_ptr = out + blockIdx.x * d;
constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
const int64_t token_idx = blockIdx.x;
const scalar_t* in_ptr = input + token_idx * d;
scalar_t* out_ptr = out + token_idx * d;
if constexpr (use_vec) {
// Fast path: 128-bit/256-bit vectorized loop
using vec_t = typename VecTraits<use_256b>::vec_t;
constexpr int ARCH_MAX_VEC_SIZE = VecTraits<use_256b>::ARCH_MAX_VEC_SIZE;
constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(scalar_t);
const vec_t* in_vec = reinterpret_cast<const vec_t*>(in_ptr);
vec_t* out_vec = reinterpret_cast<vec_t*>(out_ptr);
// Check alignment for 128-bit vectorized access
const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
if (aligned && d >= VEC_SIZE) {
// Fast path: 128-bit vectorized loop
const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
int4* out_vec = reinterpret_cast<int4*>(out_ptr);
const int num_vecs = d / VEC_SIZE;
const int vec_end = num_vecs * VEC_SIZE;
for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
vec_t v;
if constexpr (use_256b) {
ld256(v, &in_vec[i]);
} else {
v = VLLM_LDG(&in_vec[i]);
}
int4 v = VLLM_LDG(&in_vec[i]), r;
auto* vp = reinterpret_cast<scalar_t*>(&v);
auto* rp = reinterpret_cast<scalar_t*>(&r);
#pragma unroll
for (int j = 0; j < VEC_SIZE; j++) {
vp[j] = ACT_FN(vp[j]);
}
if constexpr (use_256b) {
st256(v, &out_vec[i]);
} else {
out_vec[i] = v;
rp[j] = ACT_FN(vp[j]);
}
out_vec[i] = r;
}
// Scalar cleanup for remaining elements
for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
}
} else {
// Scalar fallback for unaligned data or small d
@@ -620,43 +365,18 @@ __global__ void activation_kernel(
} // namespace vllm
// Launch element-wise activation kernel.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
auto dtype = input.scalar_type(); \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / input.size(-1); \
if (num_tokens == 0) { \
return; \
} \
dim3 grid(num_tokens); \
int cc_major = at::cuda::getCurrentDeviceProperties()->major; \
int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \
int vec_size = support_vec / at::elementSize(dtype); \
const bool use_vec = (d % vec_size == 0); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
if (use_vec) { \
dim3 block(std::min(d / vec_size, 1024)); \
if (cc_major >= 10 && num_tokens > 128) { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, true> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
} else { \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, true, false> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
} \
} else { \
dim3 block(std::min(d, 1024)); \
VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>, false> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
}); \
}
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / d; \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});
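A hedged usage sketch for the simplified macro (gelu_example is a placeholder entry point, not one defined in this diff); the element-wise wrappers elsewhere in the file follow the same pattern as the gating wrappers above:

// Placeholder wrapper for illustration only; vllm::gelu_kernel is the exact-GELU
// functor defined earlier in this file.
void gelu_example(torch::Tensor& out,    // [..., d]
                  torch::Tensor& input)  // [..., d]
{
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_kernel);
}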
namespace vllm {


@@ -1234,13 +1234,8 @@ void cp_gather_and_upconvert_fp8_kv_cache(
"src_cache and seq_lens must be on the same device");
TORCH_CHECK(src_cache.device() == workspace_starts.device(),
"src_cache and workspace_starts must be on the same device");
auto dtype = src_cache.scalar_type();
TORCH_CHECK(
dtype == at::ScalarType::Byte || // uint8
dtype == at::ScalarType::Float8_e4m3fn || // fp8 e4m3
dtype == at::ScalarType::Float8_e5m2, // fp8 e5m2
"src_cache must be uint8, float8_e4m3fn, or float8_e5m2, but got ",
src_cache.dtype());
TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8");
TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16");
TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA");
@@ -1249,21 +1244,14 @@ void cp_gather_and_upconvert_fp8_kv_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
const uint8_t* src_ptr = nullptr;
if (dtype == at::ScalarType::Byte) {
src_ptr = src_cache.data_ptr<uint8_t>();
} else {
// float8_e4m3fn or float8_e5m2
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
}
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
src_cache.data_ptr<uint8_t>(),
reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table_stride, cache_block_stride, cache_entry_stride,
@@ -1305,8 +1293,7 @@ void indexer_k_quant_and_cache(
const at::cuda::OptionalCUDAGuard device_guard(device_of(k));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
static const std::string kv_cache_dtype = "fp8_e4m3";
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), kv_cache_dtype,
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}


@@ -16,8 +16,6 @@ torch::Tensor get_scheduler_metadata(
isa = cpu_attention::ISA::VEC16;
} else if (isa_hint == "neon") {
isa = cpu_attention::ISA::NEON;
} else if (isa_hint == "vxe") {
isa = cpu_attention::ISA::VXE;
} else {
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
}
@@ -102,8 +100,6 @@ void cpu_attn_reshape_and_cache(
return cpu_attention::ISA::VEC16;
} else if (isa == "neon") {
return cpu_attention::ISA::NEON;
} else if (isa == "vxe") {
return cpu_attention::ISA::VXE;
} else {
TORCH_CHECK(false, "Invalid ISA type: " + isa);
}
