[Test] Add tests for n parameter in chat completions API (#35283 )

Signed-off-by: KrxGu <krishom70@gmail.com>
[Bugfix][Hardware][AMD] Gate FP4 ops on gfx950 to prevent MI300X crash (#35250 )
2026-02-26 09:14:07 +00:00 · 2026-02-26 16:11:07 +08:00 · 2026-02-26 16:05:40 +08:00 · 2026-02-26 15:46:44 +08:00 · 2026-02-26 07:15:16 +00:00 · 2026-02-25 22:35:41 -08:00
1260 changed files with 101937 additions and 27281 deletions
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,6 +1,7 @@
-group: Hardware
+group: Hardware - AMD Build 
 steps:
  - label: "AMD: :docker: build image"
    key: image-build-amd
    depends_on: []
    device: amd_cpu
    no_plugin: true
@@ -9,7 +10,7 @@ steps:
      docker build
      --build-arg max_jobs=16
      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950'
      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
      -f docker/Dockerfile.rocm
--- a/.buildkite/hardware_tests/arm.yaml
+++ b/.buildkite/hardware_tests/arm.yaml
@@ -1,8 +0,0 @@
 group: Hardware
 steps:
  - label: "Arm CPU Test"
    soft_fail: true
    device: arm_cpu
    no_plugin: true
    commands: 
    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -0,0 +1,100 @@
 # CPU test group: five Intel CPU steps followed by one Arm CPU step.
 # Every step is marked soft_fail and declares no step dependencies.
 # NOTE(review): `device`, `no_plugin` and `source_file_dependencies` are not
 # stock Buildkite step attributes; presumably they are consumed by the
 # project's pipeline generator. Confirm there before changing them.
 group: CPU
 depends_on: []
 steps:
 # CPU kernel tests (attention, fused MoE, oneDNN).
 # The `20m` passed to run-cpu-test.sh is presumably a time limit; TODO confirm
 # against the script.
 - label: CPU-Kernel Tests
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  - csrc/cpu/
  - cmake/cpu_extension.cmake
  - CMakeLists.txt
  - vllm/_custom_ops.py
  - tests/kernels/attention/test_cpu_attn.py
  - tests/kernels/moe/test_cpu_fused_moe.py
  - tests/kernels/test_onednn.py
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
      pytest -x -v -s tests/kernels/test_onednn.py"
 # Language generation and pooling model tests, restricted to the `cpu_model`
 # pytest marker.
 - label: CPU-Language Generation and Pooling Model Tests
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  - csrc/cpu/
  - vllm/
  - tests/models/language/generation/
  - tests/models/language/pooling/
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
      pytest -x -v -s tests/models/language/generation -m cpu_model
      pytest -x -v -s tests/models/language/pooling -m cpu_model"
 # Quantization tests: one selected compressed-tensors W8A8 test plus the CPU
 # WNA16 test file.
 - label: CPU-Quantization Model Tests
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  - csrc/cpu/
  - vllm/model_executor/layers/quantization/cpu_wna16.py
  - vllm/model_executor/layers/quantization/gptq_marlin.py
  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
  - tests/quantization/test_compressed_tensors.py
  - tests/quantization/test_cpu_wna16.py
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
 # Distributed smoke test, driven by a dedicated shell script instead of pytest.
 - label: CPU-Distributed Tests
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  - csrc/cpu/shm.cpp
  - vllm/v1/worker/cpu_worker.py
  - vllm/v1/worker/gpu_worker.py
  - vllm/v1/worker/cpu_model_runner.py
  - vllm/v1/worker/gpu_model_runner.py
  - vllm/platforms/cpu.py
  - vllm/distributed/parallel_state.py
  - vllm/distributed/device_communicators/cpu_communicator.py
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
 # Multimodal generation tests (test_pixtral.py is excluded), sharded across
 # `parallelism: 2` jobs: shard count and id come from the
 # BUILDKITE_PARALLEL_JOB_COUNT / BUILDKITE_PARALLEL_JOB variables.
 # NOTE(review): `$$` presumably defers expansion of those variables until the
 # job runs, and `%N` in the label is presumably replaced with the shard index;
 # confirm both.
 - label: CPU-Multi-Modal Model Tests %N
  depends_on: []
  soft_fail: true
  device: intel_cpu
  no_plugin: true
  source_file_dependencies:
  # - vllm/
  - vllm/model_executor/layers/rotary_embedding
  - tests/models/multimodal/generation/
  commands:
    - |
      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
  parallelism: 2
 # Arm CPU test: the only step on the `arm_cpu` device; it lists no
 # source_file_dependencies.
 - label: "Arm CPU Test"
  depends_on: []
  soft_fail: true
  device: arm_cpu
  no_plugin: true
  commands: 
  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,13 +1,6 @@
 group: Hardware
 depends_on: ~
 steps:
  - label: "Intel CPU Test"
    soft_fail: true
    device: intel_cpu
    no_plugin: true
    commands: 
    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
  - label: "Intel HPU Test"
    soft_fail: true
    device: intel_hpu
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -8,7 +8,7 @@ clean_docker_tag() {
 }
 print_usage_and_exit() {
-    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
    exit 1
 }
@@ -142,11 +142,16 @@ resolve_parent_commit() {
 print_bake_config() {
    echo "--- :page_facing_up: Resolved bake configuration"
-    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    # Write to a temp directory to avoid polluting the repo root (which is the
    # Docker build context). Files left in the repo root get COPY'd into the
    # image and can cause duplicate artifact uploads from downstream steps.
    local bake_tmp
    bake_tmp="$(mktemp -d)"
    BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
    echo "--- :arrow_down: Uploading bake config to Buildkite"
-    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+    (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
 }
 #################################
@@ -154,7 +159,7 @@ print_bake_config() {
 #################################
 print_instance_info
-if [[ $# -lt 7 ]]; then
+if [[ $# -lt 5 ]]; then
    print_usage_and_exit
 fi
@@ -163,10 +168,8 @@ REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-VLLM_USE_PRECOMPILED=$5
+IMAGE_TAG=$5
-VLLM_MERGE_BASE_COMMIT=$6
+IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
 IMAGE_TAG=$7
 IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
 # build config
 TARGET="test-ci"
@@ -193,8 +196,6 @@ export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
 export CACHE_TO
 export VLLM_USE_PRECOMPILED
 export VLLM_MERGE_BASE_COMMIT
 # print args
 echo "--- :mag: Arguments"
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
 echo "REPO: ${REPO}"
 echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
 echo "BRANCH: ${BRANCH}"
 echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
 echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
 echo "IMAGE_TAG: ${IMAGE_TAG}"
 echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,9 +3,9 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    timeout_in_minutes: 600
    commands:
-    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,13 +24,13 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
  --build-arg VLLM_CPU_AVX512BF16=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
  --target vllm-test \
  --progress plain .
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
  --target vllm-test \
  --progress plain .
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
-  --limit $LIMIT
+  --limit "$LIMIT"
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
@@ -20,14 +20,11 @@ usage() {
    echo
 }
-while getopts "m:b:l:f:t:" OPT; do
+while getopts "m:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
@@ -9,8 +9,10 @@ import json
 import os
 from dataclasses import dataclass
 from importlib import util
 from pathlib import Path
 import pandas as pd
 import regex as re
 pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
@@ -275,6 +277,131 @@ def _apply_two_decimals(
    return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
 # -----------------------------
 # Export helpers (Excel + CSV)
 # -----------------------------
 def _sanitize_sheet_name(name: str) -> str:
    """
    Excel sheet constraints:
      - max 31 chars
      - cannot contain: : \ / ? * [ ]
      - cannot be empty
    """
    name = "sheet" if name is None else str(name)
    name = re.sub(r"[:\\/?*\[\]]", "_", name)
    name = name.strip().strip("'")
    name = re.sub(r"\s+", " ", name)
    if not name:
        name = "sheet"
    return name[:31]
 def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
    """Build the base sheet name for one group key.

    The name is the model's last path component, followed by
    ``_<Input Len>x<Output Len>`` when both lengths are present in the key.

    Args:
        group_cols: Column names, positionally matching ``gkey_tuple``.
        gkey_tuple: The group's key values.

    Returns:
        The name after passing through ``_sanitize_sheet_name``.
    """
    fields = dict(zip(group_cols, gkey_tuple))
    # Keep only what follows the last "/" in the model identifier.
    short_model = str(fields.get("Model", "model")).rsplit("/", 1)[-1]
    in_len = fields.get("Input Len", "")
    out_len = fields.get("Output Len", "")
    if in_len != "" and out_len != "":
        label = f"{short_model}_{in_len}x{out_len}"
    else:
        label = f"{short_model}"
    return _sanitize_sheet_name(label)
 def _write_tables_to_excel_sheet(
    writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
 ):
    startrow = 0
    for title, df in blocks:
        pd.DataFrame([[title]]).to_excel(
            writer, sheet_name=sheet, index=False, header=False, startrow=startrow
        )
        startrow += 1
        df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow)
        startrow += len(df) + 3
 def _safe_filename(s: str) -> str:
    s = re.sub(r"[^\w\-.]+", "_", str(s).strip())
    return s[:180] if len(s) > 180 else s
 # -----------------------------
 # vLLM environment export helper
 # -----------------------------
 def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
    """Parse vllm_env.txt into a flat table (Section, Key, Value).
    Supports:
      - section headers as standalone lines (no ':' or '=')
      - key-value lines like 'OS: Ubuntu ...'
      - env var lines like 'HF_HOME=/data/hf'
    """
    lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
    section = "General"
    rows: list[dict] = []
    def set_section(s: str):
        nonlocal section
        s = (s or "").strip()
        if s:
            section = s
    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            continue
        # divider lines like =====
        if set(stripped) <= {"="}:
            continue
        # section header heuristic: short standalone line
        if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
            if stripped.lower().startswith("collecting environment information"):
                continue
            set_section(stripped)
            continue
        # env var style: KEY=VALUE (and not a URL with :)
        if "=" in stripped and ":" not in stripped:
            k, v = stripped.split("=", 1)
            k = k.strip()
            v = v.strip()
            if k:
                rows.append({"Section": section, "Key": k, "Value": v})
            continue
        # key: value
        if ":" in stripped:
            k, v = stripped.split(":", 1)
            k = k.strip()
            v = v.strip()
            if k:
                rows.append({"Section": section, "Key": k, "Value": v})
            continue
    return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
 def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
    """Load vllm_env.txt next to the *original* input JSON file.
    Note: when only one -f is provided, the script may split JSON into ./splits/...,
    but vllm_env.txt typically lives next to the original benchmark_results.json.
    """
    base_dir: Path | None = None
    if getattr(args, "file", None):
        base_dir = Path(args.file[0]).resolve().parent
    elif files:
        base_dir = Path(files[0]).resolve().parent
    if base_dir is None:
        return None
    env_path = base_dir / "vllm_env.txt"
    if not env_path.exists():
        return None
    df = _parse_vllm_env_txt(env_path)
    return df
 # -----------------------------
 # Valid max concurrency summary helpers
 # -----------------------------
@@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html(
    summary_df = pd.DataFrame(rows)
    # --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
    for c in summary_df.columns:
        if c == "Configuration":
            continue
@@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html(
    both_col = f"Max {conc_col} (Both)"
    # --- Strict 2-decimal formatting for ALL non-Configuration columns ---
    formatters = {}
    for c in summary_df.columns:
        if c == "Configuration":
            continue
        # one shared formatter for every column: blank for missing values, two decimals otherwise
        formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
    styler = summary_df.style.format(formatters)
@@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html(
    return title + styler.to_html(table_attributes='border="1" class="dataframe"')
 def build_valid_max_concurrency_summary_df(
    tput_group_df: pd.DataFrame | None,
    ttft_group_df: pd.DataFrame | None,
    tpot_group_df: pd.DataFrame | None,
    conc_col: str,
    args,
 ) -> pd.DataFrame | None:
    if ttft_group_df is None and tpot_group_df is None:
        return None
    ttft_cols = (
        _config_value_columns(ttft_group_df, conc_col)
        if ttft_group_df is not None
        else []
    )
    tpot_cols = (
        _config_value_columns(tpot_group_df, conc_col)
        if tpot_group_df is not None
        else []
    )
    tput_cols = (
        _config_value_columns(tput_group_df, conc_col)
        if tput_group_df is not None
        else []
    )
    if ttft_group_df is not None and tpot_group_df is not None:
        cfg_cols = [c for c in ttft_cols if c in tpot_cols]
        if tput_group_df is not None:
            cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
    else:
        cfg_cols = ttft_cols or tpot_cols
    if not cfg_cols:
        cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
    rows = []
    for cfg in cfg_cols:
        ttft_max = (
            _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_max = (
            _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
            if tpot_group_df is not None
            else pd.NA
        )
        both = (
            pd.NA
            if (pd.isna(ttft_max) or pd.isna(tpot_max))
            else min(ttft_max, tpot_max)
        )
        tput_at_both = (
            _value_at_concurrency(tput_group_df, conc_col, cfg, both)
            if tput_group_df is not None
            else pd.NA
        )
        ttft_at_both = (
            _value_at_concurrency(ttft_group_df, conc_col, cfg, both)
            if ttft_group_df is not None
            else pd.NA
        )
        tpot_at_both = (
            _value_at_concurrency(tpot_group_df, conc_col, cfg, both)
            if tpot_group_df is not None
            else pd.NA
        )
        rows.append(
            {
                "Configuration": cfg,
                f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
                f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
                f"Max {conc_col} (Both)": both,
                "Output Tput @ Both (tok/s)": tput_at_both,
                "TTFT @ Both (ms)": ttft_at_both,
                "TPOT @ Both (ms)": tpot_at_both,
            }
        )
    df = pd.DataFrame(rows)
    for c in df.columns:
        if c != "Configuration":
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df
 # -----------------------------
 # Plot helper
 # -----------------------------
@@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser:
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )
    # ---- NEW: export options ----
    parser.add_argument(
        "--excel-out",
        type=str,
        default="perf_comparison.xlsx",
        help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
    )
    parser.add_argument(
        "--csv-out-dir",
        type=str,
        default="",
        help="If set, write per-group per-metric CSVs into this directory.",
    )
    return parser
@@ -657,7 +885,6 @@ def maybe_write_plot(
        markers=True,
    )
    # Ensure plot hover + y tick labels are also 2 decimals.
    fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
    fig.update_yaxes(tickformat=".2f")
@@ -730,6 +957,27 @@ def write_report_group_first(
        for metric_label, (df, _) in metric_cache.items()
    }
    csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None
    if csv_dir:
        csv_dir.mkdir(parents=True, exist_ok=True)
    excel_path = args.excel_out or "perf_comparison.xlsx"
    with pd.ExcelWriter(excel_path, engine="openpyxl") as xw:
        # ---- Environment sheet (first) ----
        env_sheet = _sanitize_sheet_name("Environment")
        env_df = _load_env_df_for_inputs(args, files)
        if env_df is None or env_df.empty:
            pd.DataFrame(
                [
                    {
                        "Section": "Environment",
                        "Key": "vllm_env.txt",
                        "Value": "NOT FOUND (or empty)",
                    }
                ]
            ).to_excel(xw, sheet_name=env_sheet, index=False)
        else:
            env_df.to_excel(xw, sheet_name=env_sheet, index=False)
        with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
            main_fh.write('<meta charset="utf-8">\n')
            for gkey in group_keys:
@@ -744,6 +992,16 @@ def write_report_group_first(
                )
                main_fh.write(group_header)
                sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
                sheet_base = sheet
                dedup_i = 1
                # Sheet names are capped at 31 chars, so shorten the base to
                # leave room for the "_N" suffix. Appending to a full-length
                # base would get the suffix truncated away, the candidate name
                # would stop changing, and this loop would never terminate.
                while sheet in xw.sheets:
                    dedup_i += 1
                    suffix = f"_{dedup_i}"
                    sheet = _sanitize_sheet_name(
                        f"{sheet_base[: 31 - len(suffix)]}{suffix}"
                    )
                excel_blocks: list[tuple[str, pd.DataFrame]] = []
                with open(sub_path, "w", encoding="utf-8") as sub_fh:
                    sub_fh.write('<meta charset="utf-8">\n')
                    sub_fh.write(group_header)
@@ -765,7 +1023,6 @@ def write_report_group_first(
                                f"{_html.escape(metric_label)} — missing for this group"
                                "</div>\n"
                            )
                            main_fh.write(missing)
                            sub_fh.write(missing)
                            continue
@@ -801,6 +1058,17 @@ def write_report_group_first(
                            args=args,
                        )
                        excel_blocks.append(
                            (metric_label, display_group.reset_index(drop=True))
                        )
                        if csv_dir:
                            fn = _safe_filename(
                                f"{sheet}__{metric_label}".replace(" ", "_").replace(
                                    "/", "_"
                                )
                            )
                            display_group.to_csv(csv_dir / f"{fn}.csv", index=False)
                    summary_html = build_valid_max_concurrency_summary_html(
                        tput_group_df=tput_group_df,
                        ttft_group_df=ttft_group_df,
@@ -812,6 +1080,29 @@ def write_report_group_first(
                        main_fh.write(summary_html)
                        sub_fh.write(summary_html)
                    summary_df = build_valid_max_concurrency_summary_df(
                        tput_group_df=tput_group_df,
                        ttft_group_df=ttft_group_df,
                        tpot_group_df=tpot_group_df,
                        conc_col=conc_col,
                        args=args,
                    )
                    if summary_df is not None:
                        excel_blocks.append(
                            ("Valid Max Concurrency Summary", summary_df)
                        )
                        if csv_dir:
                            fn = _safe_filename(
                                f"{sheet}__Valid_Max_Concurrency_Summary"
                            )
                            summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
                _write_tables_to_excel_sheet(xw, sheet, excel_blocks)
    print(f"Wrote Excel: {excel_path}")
    if csv_dir:
        print(f"Wrote CSVs under: {csv_dir}")
 def main():
    args = build_parser().parse_args()
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
 # This script should be run inside the CI process
 # This script assumes that we are already inside the vllm/ directory
 # Benchmarking results will be available inside vllm/benchmarks/results/
@@ -9,14 +7,19 @@
 set -x
 set -o pipefail
 # Environment-driven debug controls (like ON_CPU=1)
 DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+    declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
  elif command -v hl-smi; then
-    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
+    declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
  fi
  if [[ $gpu_count -gt 0 ]]; then
@@ -44,7 +47,7 @@ check_cpus() {
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
-    echo $numa_count
+    echo "$numa_count"
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
@@ -112,13 +115,12 @@ json2envs() {
 }
 wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
+    until curl -sf http://localhost:8000/v1/models >/dev/null; do
      sleep 1
-    done' && return 0 || return 1
+    done
  '
 }
 kill_processes_launched_by_current_bash() {
@@ -252,37 +254,16 @@ run_benchmark_tests() {
  done
 }
-run_latency_tests() {
+run_latency_tests() { run_benchmark_tests "latency" "$1"; }
-  run_benchmark_tests "latency" "$1"
+run_startup_tests() { run_benchmark_tests "startup" "$1"; }
-}
+run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }
-run_startup_tests() {
+merge_serving_tests_stream() {
-  run_benchmark_tests "startup" "$1"
+  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
-}
+  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
-
+  local serving_test_file="$1"
-run_throughput_tests() {
+  # shellcheck disable=SC2016
-  run_benchmark_tests "throughput" "$1"
+  local merged='
 }
 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  #
  # Supported JSON formats:
  # 1) Plain format: top-level array
  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #
  # 2) Default parameters field + plain format tests
  #    {
  #      "defaults": { ... },
  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #    }
  local serving_test_file
  serving_test_file=$1
  # Iterate over serving tests
  jq -c '
    if type == "array" then
      # Plain format: test cases array
      .[]
@@ -304,7 +285,50 @@ run_serving_tests() {
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
-  ' "$serving_test_file" | while read -r params; do
+  '
  jq -c "$merged" "$serving_test_file" | \
  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
      select((($model|length)==0)
             or ((.server_parameters.model // "") == $model)
             or ((.client_parameters.model // "") == $model))
      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
    '
  else
    cat
  fi
 }
 run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  #
  # Supported JSON formats:
  # 1) Plain format: top-level array
  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #
  # 2) Default parameters field + plain format tests
  #    {
  #      "defaults": { ... },
  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #    }
  local serving_test_file
  serving_test_file=$1
  # In dry-run mode, if filters are provided but no tests match, report it and return (status 0) without running anything.
  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
    local count
    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
    if [[ "$count" -eq 0 ]]; then
      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
      return 0
    fi
  fi
  # Iterate over serving tests (merged + optional filtered stream)
  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -373,7 +397,7 @@ run_serving_tests() {
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
-    if [[ -z "${REMOTE_HOST}" ]]; then
+    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
@@ -384,6 +408,9 @@ run_serving_tests() {
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
        # dry-run: don't start server
        echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
@@ -402,14 +429,12 @@ run_serving_tests() {
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi
      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
-        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
@@ -425,7 +450,9 @@ run_serving_tests() {
        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
        if [[ "${DRY_RUN:-0}" != "1" ]]; then
          bash -c "$client_command"
        fi
        # record the benchmarking commands
        jq_output=$(jq -n \
@@ -443,12 +470,15 @@ run_serving_tests() {
    done
    # clean up
-    kill -9 $server_pid
+    if [[ "${DRY_RUN:-0}" != "1" ]]; then
      kill -9 "$server_pid"
      kill_gpu_processes
    fi
  done
 }
 main() {
  local ARCH
  ARCH=''
  if [[ "$ON_CPU" == "1" ]]; then
@@ -458,7 +488,13 @@ main() {
     check_gpus
     ARCH="$arch_suffix"
  fi
  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    check_hf_token
  else
    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
  fi
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -479,11 +515,16 @@ main() {
  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?
  if [[ "${DRY_RUN:-0}" == "1" ]]; then
    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
    exit 0
  fi
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
@@ -0,0 +1,41 @@
 {
  "defaults": {
    "qps_list": [
      "inf"
    ],
    "max_concurrency_list": [
      32,
      64,
      128
    ],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "dtype": "bfloat16",
      "model": "jinaai/jina-embeddings-v3",
      "trust_remote_code": ""
    },
    "client_parameters": {
      "model": "jinaai/jina-embeddings-v3",
      "backend": "openai-embeddings",
      "endpoint": "/v1/embeddings",
      "dataset_name": "sharegpt",
      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {}
    }
  ]
 }
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -0,0 +1,283 @@
 {
  "defaults": {
    "qps_list": [
      "inf"
    ],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "ignore-eos": "",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_llama8B_tp1_sharegpt",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "sharegpt",
        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
      }
    },
    {
      "test_name": "serving_llama8B_tp2_sharegpt",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "sharegpt",
        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_128",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_128",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_128",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_2048",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_2048",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 2048
      }
    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp2_random_2048_128",
      "server_parameters": {
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_tp4_random_2048_128",
      "server_parameters": {
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp2_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp4_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_granite2B_tp1_random_128_128",
      "server_parameters": {
        "model": "ibm-granite/granite-3.2-2b-instruct",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "ibm-granite/granite-3.2-2b-instruct",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen1.7B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-1.7B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-1.7B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen4B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-4B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-4B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen8B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-8B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-8B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_glm9B_tp1_random_128_128",
      "server_parameters": {
        "model": "zai-org/glm-4-9b-hf",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "zai-org/glm-4-9b-hf",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_gemma7B_tp1_random_128_128",
      "server_parameters": {
        "model": "google/gemma-7b",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "google/gemma-7b",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    }
  ]
 }
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -148,136 +148,6 @@
        "random-input-len": 2048,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp2_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama8B_int4_tp4_random_128_128",
      "server_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 4
      },
      "client_parameters": {
        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_granite2B_tp1_random_128_128",
      "server_parameters": {
        "model": "ibm-granite/granite-3.2-2b-instruct",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "ibm-granite/granite-3.2-2b-instruct",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen1.7B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-1.7B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-1.7B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen4B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-4B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-4B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_qwen8B_tp1_random_128_128",
      "server_parameters": {
        "model": "Qwen/Qwen3-8B",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "Qwen/Qwen3-8B",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_glm9B_tp1_random_128_128",
      "server_parameters": {
        "model": "zai-org/glm-4-9b-hf",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "zai-org/glm-4-9b-hf",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    },
    {
      "test_name": "serving_gemma7B_tp1_random_128_128",
      "server_parameters": {
        "model": "google/gemma-7b",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "model": "google/gemma-7b",
        "dataset_name": "random",
        "random-input-len": 128,
        "random-output-len": 128
      }
    }
  ]
 }
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -27,7 +27,7 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
 To download and upload the image:
 \`\`\`
-Download images:
+# Download images:
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
@@ -35,8 +35,12 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
 docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
 docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
-Tag and push images:
+# Tag and push images:
 ## CUDA
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -62,34 +66,21 @@ docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-a
 docker push vllm/vllm-openai:latest-aarch64-cu130
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
+## ROCm
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
+
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker push vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
 docker push vllm/vllm-openai-rocm:latest
 docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
 Create multi-arch manifest:
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
 docker push vllm/vllm-openai-rocm:latest-base
 docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker manifest rm vllm/vllm-openai:latest
+## CPU
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 docker manifest rm vllm/vllm-openai:latest-cu130
 docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
 docker manifest push vllm/vllm-openai:latest-cu130
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
 # CPU images (vllm/vllm-openai-cpu)
 docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
 docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
 docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
 docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
@@ -103,6 +94,20 @@ docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-a
 docker push vllm/vllm-openai-cpu:latest-arm64
 docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
 # Create multi-arch manifest:
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 docker manifest rm vllm/vllm-openai:latest-cu130
 docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
 docker manifest push vllm/vllm-openai:latest-cu130
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
 docker manifest rm vllm/vllm-openai-cpu:latest || true
 docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
 docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
 S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
 # Format ROCm version for path by stripping the dots (e.g., "7.1.0" -> "rocm710")
-ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
+ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
 ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
 buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
 ## ROCm Wheel and Docker Image Releases
--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -83,7 +83,7 @@ case "${1:-}" in
            exit 1
        fi
-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
        echo ""
        echo "Downloaded wheels:"
-        ls -lh artifacts/rocm-base-wheels/
+        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
 # Store PR data in a temp file
 PR_DATA=$(mktemp)
-trap "rm -f $PR_DATA" EXIT
+trap 'rm -f "$PR_DATA"' EXIT
 if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
    --limit 1000 \
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -1,25 +1,37 @@
 #!/bin/bash
-# This script runs test inside the corresponding ROCm docker container.
+# This script runs tests inside the corresponding ROCm docker container.
 # It handles both single-node and multi-node test configurations.
 #
 # Multi-node detection: Instead of matching on fragile group names, we detect
 # multi-node jobs structurally by looking for the bracket command syntax
 # "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
 set -o pipefail
 # Export Python path
 export PYTHONPATH=".."
-# Print ROCm version
+###############################################################################
-echo "--- Confirming Clean Initial State"
+# Helper Functions
-while true; do
+###############################################################################
-        sleep 3
+
 wait_for_clean_gpus() {
  # Poll /opt/amdgpu/etc/gpu_state until it contains the word "clean".
  #   $1: timeout in seconds (defaults to 300 when omitted).
  # Returns as soon as the state file reports "clean"; if the timeout elapses
  # first, prints an error to stderr and exits the whole script with status 1.
  local timeout=${1:-300}
  # SECONDS is bash's running timer; remember where we started.
  local start=$SECONDS
  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
  while true; do
    if grep -q clean /opt/amdgpu/etc/gpu_state; then
      echo "GPUs state is \"clean\""
-                break
+      return
    fi
-done
+    if (( SECONDS - start >= timeout )); then
      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
      exit 1
    fi
    # Re-check every 3 seconds.
    sleep 3
  done
 }
 echo "--- ROCm info"
 rocminfo
 # cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -28,15 +40,12 @@ cleanup_docker() {
    exit 1
  fi
  echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
+
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
@@ -45,31 +54,166 @@ cleanup_docker() {
 }
 cleanup_network() {
  # Best-effort teardown of multi-node test leftovers: stop any container
  # matching node0..node<N-1> and remove the "docker-net" network if listed.
  # N comes from NUM_NODES and falls back to 2 when that variable is unset.
  # Failures of `docker stop` / `docker network rm` are ignored (|| true).
-  for node in $(seq 0 $((NUM_NODES-1))); do
+  local max_nodes=${NUM_NODES:-2}
-    if docker pr -a -q -f name="node${node}" | grep -q .; then
+  for node in $(seq 0 $((max_nodes - 1))); do
-      docker stop "node${node}"
+    if docker ps -a -q -f name="node${node}" | grep -q .; then
      docker stop "node${node}" || true
    fi
  done
-  if docker network ls | grep docker-net; then
+  if docker network ls | grep -q docker-net; then
-    docker network rm docker-net
+    docker network rm docker-net || true
  fi
 }
-# Call the cleanup docker function
+is_multi_node() {
  # Decide whether a command string describes a multi-node job.
  #   $1: the command string to inspect.
  # Returns 0 when NUM_NODES (treated as 1 if unset) is greater than 1, or when
  # the string matches the "[...] && [...]" bracket pattern; returns 1 otherwise.
  local cmds="$1"
  # Primary signal: NUM_NODES environment variable set by the pipeline
  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
    return 0
  fi
  # Fallback: detect the bracket syntax structurally
  # Pattern: [...] && [...] (per-node command arrays)
  # The regex is unanchored, so the pattern may appear anywhere in the string.
  if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
    return 0
  fi
  return 1
 }
 ###############################################################################
 # Pytest marker re-quoting
 #
 # When commands are passed through Buildkite -> shell -> $* -> bash -c,
 # quotes around pytest -m marker expressions get stripped:
 #   pytest -v -s -m 'not cpu_test' v1/core
 # becomes:
 #   pytest -v -s -m not cpu_test v1/core
 #
 # pytest then interprets "cpu_test" as a file path, not part of the marker.
 # This function detects unquoted multi-word marker expressions and re-quotes
 # them so they survive the final bash -c expansion.
 ###############################################################################
 re_quote_pytest_markers() {
  # Usage: re_quote_pytest_markers "<command string>"
  # Prints the command string on stdout with every occurrence of
  #   -m not <identifier>
  # rewritten as
  #   -m 'not <identifier>'
  # so the two-word marker expression stays a single pytest argument.
  # Only the "not <identifier>" form is handled (e.g. not cpu_test).
  local raw_cmds="$1"
  local marker_rule="s/-m not ([a-zA-Z_][a-zA-Z0-9_]*)/-m 'not \1'/g"
  local fixed_cmds
  fixed_cmds=$(echo "$raw_cmds" | sed -E "$marker_rule")
  echo "$fixed_cmds"
 }
 ###############################################################################
 # ROCm-specific pytest command rewrites
 #
 # These apply ignore flags and environment overrides for tests that are not
 # yet supported or behave differently on ROCm hardware. Kept as a single
 # function so new exclusions are easy to add in one place.
 ###############################################################################
 apply_rocm_test_overrides() {
  # Usage: apply_rocm_test_overrides "<command string>"
  # Prints the command string on stdout after applying the ROCm-specific
  # rewrites below. Strings that match none of the patterns pass through
  # unchanged.
  local rewritten="$1"
  # --- In-place substitutions ---
  # ${var//pattern/replacement} leaves the string untouched when the pattern
  # is absent, so these need no surrounding match test.
  # Model registry: deselect the listed architectures via -k.
  rewritten=${rewritten//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
  # LoRA: run with custom paged attention disabled.
  rewritten=${rewritten//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
  # --- Kernel suites: --ignore flags appended to the END of the string ---
  if [[ $rewritten == *" kernels/core"* ]]; then
    rewritten+=" \
    --ignore=kernels/core/test_fused_quant_layernorm.py \
    --ignore=kernels/core/test_permute_cols.py"
  fi
  if [[ $rewritten == *" kernels/attention"* ]]; then
    rewritten+=" \
    --ignore=kernels/attention/test_attention_selector.py \
    --ignore=kernels/attention/test_encoder_decoder_attn.py \
    --ignore=kernels/attention/test_flash_attn.py \
    --ignore=kernels/attention/test_flashinfer.py \
    --ignore=kernels/attention/test_prefix_prefill.py \
    --ignore=kernels/attention/test_cascade_flash_attn.py \
    --ignore=kernels/attention/test_mha_attn.py \
    --ignore=kernels/attention/test_lightning_attn.py \
    --ignore=kernels/attention/test_attention.py"
  fi
  if [[ $rewritten == *" kernels/quantization"* ]]; then
    rewritten+=" \
    --ignore=kernels/quantization/test_int8_quant.py \
    --ignore=kernels/quantization/test_machete_mm.py \
    --ignore=kernels/quantization/test_block_fp8.py \
    --ignore=kernels/quantization/test_block_int8.py \
    --ignore=kernels/quantization/test_marlin_gemm.py \
    --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
    --ignore=kernels/quantization/test_int8_kernel.py"
  fi
  if [[ $rewritten == *" kernels/mamba"* ]]; then
    rewritten+=" \
    --ignore=kernels/mamba/test_mamba_mixer2.py \
    --ignore=kernels/mamba/test_causal_conv1d.py \
    --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
  fi
  if [[ $rewritten == *" kernels/moe"* ]]; then
    rewritten+=" \
    --ignore=kernels/moe/test_moe.py \
    --ignore=kernels/moe/test_cutlass_moe.py \
    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
  fi
  # --- Entrypoint suites: --ignore flags inserted right after the path ---
  # The patterns require a space on both sides of the directory name.
  rewritten=${rewritten//" entrypoints/openai "/" entrypoints/openai \
    --ignore=entrypoints/openai/test_audio.py \
    --ignore=entrypoints/openai/test_shutdown.py \
    --ignore=entrypoints/openai/test_completion.py \
    --ignore=entrypoints/openai/test_models.py \
    --ignore=entrypoints/openai/test_lora_adapters.py \
    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
    --ignore=entrypoints/openai/test_root_path.py \
    --ignore=entrypoints/openai/test_tokenization.py \
    --ignore=entrypoints/openai/test_prompt_validation.py "}
  rewritten=${rewritten//" entrypoints/llm "/" entrypoints/llm \
    --ignore=entrypoints/llm/test_chat.py \
    --ignore=entrypoints/llm/test_accuracy.py \
    --ignore=entrypoints/llm/test_init.py \
    --ignore=entrypoints/llm/test_prompt_validation.py "}
  # Collapse any literal " \ " sequences down to a single space.
  rewritten=$(echo "$rewritten" | sed 's/ \\ / /g')
  echo "$rewritten"
 }
 ###############################################################################
 # Main
 ###############################################################################
 # --- GPU initialization ---
 echo "--- Confirming Clean Initial State"
 wait_for_clean_gpus
 echo "--- ROCm info"
 rocminfo
 # --- Docker housekeeping ---
 cleanup_docker
 echo "--- Resetting GPUs"
 echo "reset" > /opt/amdgpu/etc/gpu_state
 wait_for_clean_gpus
-while true; do
+# --- Pull test image ---
        sleep 3
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
                echo "GPUs state is \"clean\""
                break
        fi
 done
 echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
@@ -80,144 +224,74 @@ remove_docker_container() {
 }
 trap remove_docker_container EXIT
 # --- Prepare commands ---
 echo "--- Running container"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
-commands=$@
+commands="$*"
 echo "Raw commands: $commands"
-commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
+# Fix quoting before ROCm overrides (so overrides see correct structure)
-
+commands=$(re_quote_pytest_markers "$commands")
-if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+commands=$(apply_rocm_test_overrides "$commands")
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
 if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
 fi
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
  --ignore=kernels/core/test_fused_quant_layernorm.py \
  --ignore=kernels/core/test_permute_cols.py"
 fi
 if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
  --ignore=kernels/attention/test_attention_selector.py \
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
  --ignore=kernels/attention/test_flash_attn.py \
  --ignore=kernels/attention/test_flashinfer.py \
  --ignore=kernels/attention/test_prefix_prefill.py \
  --ignore=kernels/attention/test_cascade_flash_attn.py \
  --ignore=kernels/attention/test_mha_attn.py \
  --ignore=kernels/attention/test_lightning_attn.py \
  --ignore=kernels/attention/test_attention.py"
 fi
 if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
  --ignore=kernels/quantization/test_int8_quant.py \
  --ignore=kernels/quantization/test_machete_mm.py \
  --ignore=kernels/quantization/test_block_fp8.py \
  --ignore=kernels/quantization/test_block_int8.py \
  --ignore=kernels/quantization/test_marlin_gemm.py \
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
  --ignore=kernels/quantization/test_int8_kernel.py"
 fi
 if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
  --ignore=kernels/mamba/test_mamba_mixer2.py \
  --ignore=kernels/mamba/test_causal_conv1d.py \
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 fi
 if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
  --ignore=kernels/moe/test_moe.py \
  --ignore=kernels/moe/test_cutlass_moe.py \
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 fi
 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
  --ignore=entrypoints/openai/test_root_path.py \
  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 #ignore certain Entrypoints/llm tests
 if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 commands=$(echo "$commands" | sed 's/ \\ / /g')
 echo "Final commands: $commands"
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
 # --ignore=entrypoints/openai/test_accuracy.py \
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
 MYPYTHONPATH=".."
-# Test that we're launching on the machine that has
+# Verify GPU access
 # proper access to GPUs
 render_gid=$(getent group render | cut -d: -f3)
 if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
 fi
-if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+# --- Route: multi-node vs single-node ---
-
+if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
-  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
+  # Parse the bracket syntax:  prefix ; [node0_cmds] && [node1_cmds]
-      prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+  #   BASH_REMATCH[1] = prefix (everything before first bracket)
  #   BASH_REMATCH[2] = comma-separated node0 commands
  #   BASH_REMATCH[3] = comma-separated node1 commands
  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
    prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
    echo "PREFIX: ${prefix}"
      export composite_command="(command rocm-smi || true)"
      myIFS=$IFS
      IFS=','
      read -ra node0 <<< ${BASH_REMATCH[2]}
      read -ra node1 <<< ${BASH_REMATCH[3]}
      IFS=$myIFS
      for i in "${!node0[@]}";do 
        command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
        command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
-        export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+    export composite_command="(command rocm-smi || true)"
-        echo "COMMANDS: ${commands}"
+    saved_IFS=$IFS
-        composite_command=$(echo "${composite_command} && ${commands}")
+    IFS=','
    read -ra node0 <<< "${BASH_REMATCH[2]}"
    read -ra node1 <<< "${BASH_REMATCH[3]}"
    IFS=$saved_IFS
    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
    fi
    for i in "${!node0[@]}"; do
      command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
      command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
      echo "COMMANDS: ${step_cmd}"
      composite_command="${composite_command} && ${step_cmd}"
    done
    /bin/bash -c "${composite_command}"
    cleanup_network
  else
-      echo "Failed to parse node commands! Exiting."
+    echo "Multi-node job detected but failed to parse bracket command syntax."
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
    echo "Got: $commands"
    cleanup_network
    exit 111
  fi
 else
  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -0,0 +1,26 @@
 #!/bin/bash
 # Smoke test for distributed CPU serving. For each parallel layout (PP+TP, then
 # DP+TP) it starts `vllm serve` in the background, waits until the models
 # endpoint on localhost:8000 answers, runs a short `vllm bench serve` load and
 # then stops the server again.
 set -euox pipefail
 MODEL="meta-llama/Llama-3.2-3B-Instruct"
 server_pid=""
 # Terminate the background server, if one is running, and block until it has
 # exited. Both layouts are probed on localhost:8000, so the next server must
 # not be started (or probed with curl) while the previous one is still alive.
 stop_server() {
   if [[ -n "$server_pid" ]]; then
     kill -s SIGTERM "$server_pid" 2>/dev/null || true
     # wait reports the server's exit status, which may be non-zero after a
     # signal; ignore it so that `set -e` does not abort the script.
     wait "$server_pid" 2>/dev/null || true
     server_pid=""
   fi
 }
 # Also stop the server when the script exits early (readiness timeout or a
 # failing benchmark) so that no stray process is left behind.
 trap stop_server EXIT
 # run_case LABEL SERVE_ARGS...
 #   LABEL       text used for the log section header
 #   SERVE_ARGS  extra arguments passed to `vllm serve`
 run_case() {
   local label=$1
   shift
   echo "--- ${label}"
   vllm serve "$MODEL" "$@" &
   server_pid=$!
   timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
   vllm bench serve \
     --backend vllm \
     --dataset-name random \
     --model "$MODEL" \
     --num-prompts 20 \
     --endpoint /v1/completions
   stop_server
 }
 run_case "PP+TP" -tp=2 -pp=2
 run_case "DP+TP" -tp=2 -dp=2
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
  # Run basic model test
  podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
 }
 # All of CPU tests are expected to be finished less than 40 mins.
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,119 +2,19 @@
 # This script builds the CPU docker image and runs the offline inference inside the container.
 # It serves as a sanity check for compilation and basic model usage.
-set -ex
+set -euox pipefail
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
 # used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 IMAGE_NAME="cpu-test-$NUMA_NODE"
 TIMEOUT_VAL=$1
 TEST_COMMAND=$2
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+# building the docker image
-
+echo "--- :docker: Building Docker image"
-# Setup cleanup
+docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
 remove_docker_container() {
    set -e;
    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
  # list packages
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    pip list"
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pip list"
  # offline inference
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
    pytest -x -v -s tests/kernels/test_onednn.py"
  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
    pytest -x -v -s tests/models/language/generation -m cpu_model
    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
    pytest -x -v -s tests/models/language/pooling -m cpu_model
    pytest -x -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
  # Run AWQ/GPTQ test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
    tests/quantization/test_cpu_wna16.py"
  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
    tests/lora/test_qwenvl.py"
  # online serving: tp+pp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
  # online serving: tp+dp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
 timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -7,7 +7,7 @@ set -exuo pipefail
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat <<EOF | docker build -t ${image_name} -f - .
+cat <<EOF | docker build -t "${image_name}" -f - .
 FROM gaudi-base-image:latest
 COPY ./ /workspace/vllm
@@ -39,12 +39,12 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f ${container_name} || true; }
+remove_docker_containers() { docker rm -f "${container_name}" || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers
 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=${container_name} --network=host \
+docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  -e VLLM_SKIP_WARMUP=true \
  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -41,6 +41,7 @@ get_config() {
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
    # shellcheck source=/dev/null
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
@@ -48,9 +49,8 @@ get_config() {
 # get test running configuration.
 fetch_vllm_test_cfg
 get_config
 # Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
+if ! get_config; then
  exit 1
 fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
 echo "agent_idx: ${agent_idx}"
 builder_name="cachebuilder${agent_idx}"
 builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
+mkdir -p "${builder_cache_dir}"
 # Try building the docker image
 cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+    --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
+                           --cache-to type=local,dest="${builder_cache_dir}",mode=max \
-    --progress=plain --load -t ${image_name} -f - .
+    --progress=plain --load -t "${image_name}" -f - .
 FROM ${BASE_IMAGE_NAME}
 # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
 # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
 # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
 #   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
+#   returns one argument per line: --device, /dev/davinciX, ...
 parse_and_gen_devices() {
    local input="$1"
    local index cards_num
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
        return 1
    fi
    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
+        printf '%s\n' "--device"
        printf '%s\n' "/dev/davinci${dev_idx}"
        ((i++))
    done
    # trim leading space
    devices="${devices#"${devices%%[![:space:]]*}"}"
    # Output devices: assigned to the caller variable
    printf '%s' "$devices"
 }
-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
 # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
 # This test checks whether the OOT platform interface is functioning properly in conjunction with
 # the hardware plugin vllm-ascend.
 model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
+mkdir -p "${model_cache_dir}"
 docker run \
-    ${devices} \
+    "${device_args[@]}" \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
@@ -182,7 +177,7 @@ docker run \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
+    -v "${model_cache_dir}":/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t "${image_name}" -f docker/Dockerfile.xpu .
 # Setup cleanup
 remove_docker_container() {
@@ -39,6 +39,8 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
--- a/.buildkite/scripts/push-nightly-builds.sh
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
 # pull original arch-dependent images from AWS ECR Public
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX"
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX"
 # tag arch-dependent images
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64
 # push arch-dependent images to DockerHub
-docker push vllm/vllm-openai:$TAG_NAME-x86_64
+docker push vllm/vllm-openai:"$TAG_NAME"-x86_64
-docker push vllm/vllm-openai:$TAG_NAME-aarch64
+docker push vllm/vllm-openai:"$TAG_NAME"-aarch64
 # push arch-independent manifest to DockerHub
-docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
+docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend
-docker manifest push vllm/vllm-openai:$TAG_NAME
+docker manifest push vllm/vllm-openai:"$TAG_NAME"
-docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
+docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT"
--- a/.buildkite/scripts/run-prime-rl-test.sh
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -1,64 +0,0 @@
 #!/bin/bash
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Setup script for Prime-RL integration tests
 # This script prepares the environment for running Prime-RL tests with nightly vLLM
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
 if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
    echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
    exit 0
 fi
 echo "Setting up Prime-RL integration test environment..."
 # Clean up any existing Prime-RL directory
 if [ -d "${PRIME_RL_DIR}" ]; then
    echo "Removing existing Prime-RL directory..."
    rm -rf "${PRIME_RL_DIR}"
 fi
 # Install UV if not available
 if ! command -v uv &> /dev/null; then
    echo "Installing UV package manager..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    source $HOME/.local/bin/env
 fi
 # Clone Prime-RL repository at specific branch for reproducible tests
 PRIME_RL_BRANCH="integ-vllm-main"
 echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
 git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
 cd "${PRIME_RL_DIR}"
 echo "Setting up UV project environment..."
 export UV_PROJECT_ENVIRONMENT=/usr/local
 ln -s /usr/bin/python3 /usr/local/bin/python
 # Remove vllm pin from pyproject.toml
 echo "Removing vllm pin from pyproject.toml..."
 sed -i '/vllm==/d' pyproject.toml
 # Sync Prime-RL dependencies
 echo "Installing Prime-RL dependencies..."
 uv sync --inexact && uv sync --inexact --all-extras
 # Verify installation
 echo "Verifying installations..."
 uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
 uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
 echo "Prime-RL integration test environment setup complete!"
 echo "Running Prime-RL integration tests..."
 export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
 uv run pytest -vs tests/integration/test_rl.py -m gpu
 echo "Prime-RL integration tests completed!"
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
-    --all2all-backend $BACK \
+    --all2all-backend "$BACK" \
-    --port $PORT &
+    --port "$PORT" &
  SERVER_PID=$!
-  wait_for_server $PORT
+  wait_for_server "$PORT"
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -0,0 +1,57 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 # Nightly e2e test for prefetch offloading with a MoE model.
 # Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
 # and checks that GSM8K accuracy is at least THRESHOLD. This script does
 # not run a baseline (no offloading) for comparison.
 #
 # args: [THRESHOLD] [NUM_QUESTIONS] [PORT]
 #   THRESHOLD      minimum acceptable accuracy (default 0.25)
 #   NUM_QUESTIONS  passed to gsm8k_eval.py as --num-questions (default 1319)
 #   PORT           port the server is started on and the eval connects to (default 8030)
 # env: OUT_DIR     directory the results JSON is written to (default /tmp/vllm-scheduled)
 THRESHOLD=${1:-0.25}
 NUM_Q=${2:-1319}
 PORT=${3:-8030}
 OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
 mkdir -p "${OUT_DIR}"
 # wait_for_server PORT
 # Blocks until the /health endpoint on 127.0.0.1:PORT responds successfully,
 # or fails once 600 seconds have passed.
 wait_for_server() {
   local port=$1
   # Readiness probe target on the loopback address.
   local health_url="http://127.0.0.1:${port}/health"
   # Poll once per second in a child shell so `timeout` can cap the whole wait.
   timeout 600 bash -c '
     until curl -sf "'"$health_url"'" > /dev/null; do
       sleep 1
     done'
 }
 MODEL="deepseek-ai/DeepSeek-V2-Lite"
 # Stop the background server recorded in SERVER_PID, if it is still running:
 # send SIGTERM, poll for up to 20 x 0.5s, then send SIGKILL.
 cleanup() {
   # Nothing to stop when no PID was recorded or the process is already gone.
   [[ -n "${SERVER_PID:-}" ]] || return 0
   kill -0 "${SERVER_PID}" 2>/dev/null || return 0
   kill "${SERVER_PID}" 2>/dev/null || true
   local tries=0
   while (( tries < 20 )); do
     kill -0 "${SERVER_PID}" 2>/dev/null || break
     sleep 0.5
     tries=$(( tries + 1 ))
   done
   # Force-kill whatever is left; the error is ignored if it already exited.
   kill -9 "${SERVER_PID}" 2>/dev/null || true
 }
 # Make sure the server is stopped however the script exits.
 trap cleanup EXIT
 # Start the server in the background with prefetch offloading configured for
 # the w13_weight / w2_weight parameters.
 vllm serve "$MODEL" \
   --max-model-len 2048 \
   --offload-group-size 8 \
   --offload-num-in-group 2 \
   --offload-prefetch-step 1 \
   --offload-params w13_weight w2_weight \
   --port "$PORT" &
 SERVER_PID=$!
 wait_for_server "$PORT"
 # Build a file-name-safe tag from the model id: '/', ':', ' ' and '\' become '_'.
 # The set deliberately has no 'n': the earlier '/: \\n' spelling was read by tr
 # as a backslash plus the plain letter n, so every 'n' in a model name was
 # replaced as well. printf avoids feeding a trailing newline into tr.
 TAG=$(printf '%s' "$MODEL" | tr '/: \\' '____')
 OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
 python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
 # The heredoc is unquoted, so the shell substitutes OUT, MODEL and THRESHOLD
 # into the Python source before it runs; the assert fails the script (via
 # set -e) when accuracy is below THRESHOLD.
 python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
 assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
 PY
 # Stop the server now and clear the PID so the EXIT trap has nothing left to do.
 cleanup
 SERVER_PID=
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
  vllm serve "$MODEL" \
    --enforce-eager \
    --enable-eplb \
-    --all2all-backend $BACK \
+    --all2all-backend "$BACK" \
    --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
-    --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
-    --data-parallel-size ${DATA_PARALLEL_SIZE} \
+    --data-parallel-size "${DATA_PARALLEL_SIZE}" \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
-    --port $PORT &
+    --port "$PORT" &
  SERVER_PID=$!
-  wait_for_server $PORT
+  wait_for_server "$PORT"
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
-    --all2all-backend $BACK \
+    --all2all-backend "$BACK" \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    "${PLATFORM_ARGS[@]}" \
-    --port $PORT &
+    --port "$PORT" &
  SERVER_PID=$!
-  wait_for_server $PORT
+  wait_for_server "$PORT"
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
-  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -9,10 +9,11 @@ ENV_FILE=$1
 # For testing on local vm, use `set -a` to export all variables
 source /etc/environment
-source $ENV_FILE
+# shellcheck source=/dev/null
 source "$ENV_FILE"
 remove_docker_container() { 
-    docker rm -f $CONTAINER_NAME || true;
+    docker rm -f "$CONTAINER_NAME" || true;
 }
 trap remove_docker_container EXIT
@@ -41,13 +42,13 @@ echo
 echo "starting docker...$CONTAINER_NAME"
 echo    
 docker run \
- -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
+ -v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
- --env-file $ENV_FILE \
+ --env-file "$ENV_FILE" \
 -e HF_TOKEN="$HF_TOKEN" \
- -e TARGET_COMMIT=$BUILDKITE_COMMIT \
+ -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
- -e MODEL=$MODEL \
+ -e MODEL="$MODEL" \
 -e WORKSPACE=/workspace \
- --name $CONTAINER_NAME \
+ --name "$CONTAINER_NAME" \
 -d \
 --privileged \
 --network host \
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,21 +42,21 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
-vllm serve $MODEL \
+vllm serve "$MODEL" \
 --seed 42 \
- --max-num-seqs $MAX_NUM_SEQS \
+ --max-num-seqs "$MAX_NUM_SEQS" \
- --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
- --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+ --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
 --no-enable-prefix-caching \
- --download_dir $DOWNLOAD_DIR \
+ --download_dir "$DOWNLOAD_DIR" \
- --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
+ --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
 echo "wait for 20 minutes.."
 echo
 # sleep 1200
 # wait for 10 minutes...
-for i in {1..120}; do
+for _ in {1..120}; do
    # TODO: detect other type of errors.
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
        echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
 echo
 vllm bench serve \
    --backend vllm \
-    --model $MODEL  \
+    --model "$MODEL"  \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet_4x.txt \
-    --sonnet-input-len $INPUT_LEN \
+    --sonnet-input-len "$INPUT_LEN" \
-    --sonnet-output-len $OUTPUT_LEN \
+    --sonnet-output-len "$OUTPUT_LEN" \
    --ignore-eos > "$BM_LOG"
 echo "completed..."
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
 # i.e., the wheels are always in s3://vllm-wheels/<commit>/
 # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
-    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
-else
-    alias_arg=""
+alias_args=()
+if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
 fi
 # HACK: we do not need regex module here, but it is required by pre-commit hook
 # To avoid any external dependency, we simply replace it back to the stdlib re module
 sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
 # copy indices to /<commit>/ unconditionally
 echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -100,9 +99,10 @@ fi
 # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
 if [[ "$version" != *"dev"* ]]; then
    echo "Re-generating indices for /$pure_version/"
-    rm -rf "$INDICES_OUTPUT_DIR/*"
+    # Keep the glob outside the quotes so the shell expands it; :? aborts if the variable is unset or empty.
+    rm -rf "${INDICES_OUTPUT_DIR:?}"/*
    mkdir -p "$INDICES_OUTPUT_DIR"
    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
 RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
+GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
 echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
 aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
 echo "Wheels copied to local directory"
 # generate source tarball
-git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
 ls -la $DIST_DIR
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
@@ -65,6 +65,10 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  exit 1
 fi
-python3 -m twine check $PYPI_WHEEL_FILES
-python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
+# PYPI_WHEEL_FILES may name several wheels separated by whitespace, so split it
+# into an array on purpose; quoting the scalar would hand twine one bogus path.
+# shellcheck disable=SC2206
+pypi_wheel_files=($PYPI_WHEEL_FILES)
+python3 -m twine check "${pypi_wheel_files[@]}"
+python3 -m twine upload --non-interactive --verbose "${pypi_wheel_files[@]}"
 echo "Wheels uploaded to PyPI"
--- a/.buildkite/scripts/upload-rocm-wheels.sh
+++ b/.buildkite/scripts/upload-rocm-wheels.sh
@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
 cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
 cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
-WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
+WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
 echo "Total wheels to upload: $WHEEL_COUNT"
 if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
 fi
 # Extract version from vLLM wheel and update version-specific index
-VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
+VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
 if [ -n "$VLLM_WHEEL" ]; then
    VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
    echo "Version in wheel: $VERSION"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -14,3 +14,8 @@ steps:
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
--- a/.buildkite/test_areas/benchmarks.yaml
+++ b/.buildkite/test_areas/benchmarks.yaml
@@ -17,3 +17,15 @@ steps:
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
 - label: Attention Benchmarks Smoke Test (B200)
  device: b200
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/"
  timeout_in_minutes: 10
  source_file_dependencies:
  - benchmarks/attention_benchmarks/
  - vllm/v1/attention/
  commands:
  - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,7 +2,7 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Sequence Parallel Tests (2 GPUs)
+- label: Sequence Parallel Correctness Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  num_devices: 2
@@ -11,12 +11,12 @@ steps:
  - vllm/compilation/
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
-  - tests/distributed/test_sequence_parallel.py
+  - tests/compile/correctness_e2e/test_sequence_parallel.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: Sequence Parallel Tests (2xH100)
+- label: Sequence Parallel Correctness Tests (2xH100)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: h100
@@ -24,24 +24,30 @@ steps:
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/distributed/test_sequence_parallel.py
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 - label: AsyncTP Correctness Tests (2xH100)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
 - label: Distributed Compile Unit Tests (2xH100)
-  timeout_in_minutes: 40
+  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers
-  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/passes/distributed/
  - tests/compile/distributed/test_sequence_parallelism.py
  - tests/compile/distributed/test_async_tp.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - pytest -s -v tests/compile/passes/distributed
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_async_tp.py
 - label: Fusion and Compile Unit Tests (B200)
  timeout_in_minutes: 20
@@ -55,17 +61,17 @@ steps:
  - vllm/model_executor/layers/attention/attention.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
-  - tests/compile/test_fusion_attn.py
+  - tests/compile/passes/test_fusion_attn.py
-  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_devices=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    # TODO(luka) move to H100 once pass tests run on H100
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
@@ -115,13 +121,10 @@ steps:
  optional: true
  commands:
    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
+    # Run all models but only FLASHINFER, Inductor partition and native custom ops
    # -k "inductor_partition and not +rms_norm and not +quant_fp8"
    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
-    # Run just llama3 (fp8 & fp4) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"
    # -k "llama-3"
    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
 - label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
@@ -156,7 +159,7 @@ steps:
    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
+    # Run just llama3 (fp8 & bf16) for all config combinations
    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
 - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -191,7 +194,8 @@ steps:
    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
+    # Run all models but only FLASHINFER, Inductor partition and native custom ops
    # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
    # for ar-rms-quant-fp4, also sweep llama3
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -9,6 +9,7 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
    - pytest -v -s cuda/test_platform_no_cuda_init.py
 - label: Cudagraph
  timeout_in_minutes: 20
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -62,6 +62,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -96,9 +97,13 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  # OLD rlhf examples
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  # NEW rlhf examples
  - cd new_weight_syncing
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
@@ -140,6 +145,7 @@ steps:
  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -159,6 +165,7 @@ steps:
  num_devices: 2
  num_nodes: 2
  no_plugin: true
  optional: true # TODO: revert once infra issue solved
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -191,7 +198,18 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs))
+- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 - label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 4
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -29,15 +29,11 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Prime-RL Integration (2 GPUs)
+- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100)
-  timeout_in_minutes: 30
+  timeout_in_minutes: 60
  device: h100
  optional: true
-  soft_fail: true
+  num_devices: 1
-  num_devices: 2
  working_dir: "/vllm-workspace"
-  source_file_dependencies:
-  - vllm/
-  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
-    - nvidia-smi
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030
-    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -28,3 +28,11 @@ steps:
    - pytest -v -s v1/engine/test_preprocess_error_handling.py
    # Run the rest of v1/engine tests
    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
      commands:
      - pytest -v -s v1/e2e
      - pytest -v -s v1/engine
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -24,6 +24,11 @@ steps:
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Entrypoints Integration (API Server 1)
  timeout_in_minutes: 130
@@ -42,15 +47,13 @@ steps:
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  - tests/entrypoints/sleep
  - tests/entrypoints/instrumentator
  - tests/entrypoints/rpc
  - tests/entrypoints/instrumentator
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s entrypoints/instrumentator
-  - pytest -v -s entrypoints/sleep
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s tool_use
 - label: Entrypoints Integration (Pooling)
@@ -62,6 +65,11 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Entrypoints Integration (Responses API)
  timeout_in_minutes: 50
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -115,6 +115,7 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
    # e2e
    - pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -156,14 +157,3 @@ steps:
    - pytest -v -s kernels/moe/test_deepep_moe.py
    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
 - label: Kernels Fp4 MoE Test (B200)
  timeout_in_minutes: 60
  device: b200
  num_devices: 1
  optional: true
  commands:
    - pytest -v -s kernels/moe/test_cutedsl_moe.py
    - pytest -v -s kernels/moe/test_flashinfer_moe.py
    - pytest -v -s kernels/moe/test_nvfp4_moe.py
    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -73,3 +73,29 @@ steps:
  num_devices: 2
  commands:
    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
 - label: GPQA Eval (GPT-OSS) (H100)
  timeout_in_minutes: 120
  device: h100
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/evals/gpt_oss/
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
 - label: GPQA Eval (GPT-OSS) (B200)
  timeout_in_minutes: 120
  device: b200
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/evals/gpt_oss/
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -16,6 +16,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
    # TODO: create another `optional` test group for slow tests
    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
@@ -25,6 +26,11 @@ steps:
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: V1 Others (CPU)
  depends_on:
@@ -72,7 +78,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -108,9 +114,11 @@ steps:
  timeout_in_minutes: 50
  source_file_dependencies:
  - vllm/
  - tests/detokenizer
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s detokenizer
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_
@@ -122,6 +130,8 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/test_pooling_params.py
  - tests/test_ray_env.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -134,6 +144,8 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s test_pooling_params.py
  - pytest -v -s test_ray_env.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -141,20 +153,6 @@ steps:
  - pytest -v -s transformers_utils
  - pytest -v -s config
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  device: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
  device: h100
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -4,7 +4,6 @@ depends_on:
 steps:
 - label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -16,7 +15,6 @@ steps:
 - label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
@@ -33,10 +31,17 @@ steps:
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Basic Models Test (Other CPU) # 5min
  depends_on: 
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -4,7 +4,6 @@ depends_on:
 steps:
 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -16,7 +15,6 @@ steps:
 - label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
@@ -32,7 +30,6 @@ steps:
 - label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
@@ -40,7 +37,7 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    # Shard hybrid language model tests
    - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -48,7 +45,6 @@ steps:
 - label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -56,13 +52,21 @@ steps:
  commands:
    # Install fast path packages for testing against transformers
    # Note: also needed to run plamo2 model in vLLM
-    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
      commands:
      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
      - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 - label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
@@ -72,17 +76,20 @@ steps:
 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
 - label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,7 +3,7 @@ depends_on:
  - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
-  timeout_in_minutes: 30
+  timeout_in_minutes: 10
  source_file_dependencies:
    - vllm/
    - tests/compile
@@ -17,6 +17,14 @@ steps:
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 - label: PyTorch Compilation Passes Unit Tests
  timeout_in_minutes: 20
  source_file_dependencies:
    - vllm/
    - tests/compile/passes
  commands:
  - pytest -s -v compile/passes --ignore compile/passes/distributed
 - label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 35
  source_file_dependencies:
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -12,3 +12,10 @@ steps:
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
      commands:
      - pytest -v -s samplers
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,42 +2,60 @@
 # for more info about CODEOWNERS file
 # These lists cover the "core" components of vLLM that require careful review
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/model_executor/layers/attention @LucasWilkinson
+/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 /vllm/lora @jeejeelee
 /vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/model_executor/layers/batch_invariant.py @yewentao256 
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
-/vllm/vllm_flash_attn @LucasWilkinson
+/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config/cache.py @heheda12345
 # Entrypoints
 /vllm/entrypoints/anthropic @mgoin @DarkLight1337
 /vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
 /vllm/entrypoints/mcp @heheda12345
 /vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
 /vllm/entrypoints/openai/realtime @njhill
 /vllm/entrypoints/openai/speech_to_text @NickLucche
 /vllm/entrypoints/pooling @noooop
 /vllm/entrypoints/sagemaker @DarkLight1337
 /vllm/entrypoints/serve @njhill
 /vllm/entrypoints/*.py @njhill
 /vllm/entrypoints/chat_utils.py @DarkLight1337
 /vllm/entrypoints/llm.py @DarkLight1337
 # Input/Output Processing
 /vllm/sampling_params.py @njhill @NickLucche
 /vllm/pooling_params.py @noooop @DarkLight1337
 /vllm/tokenizers @DarkLight1337 @njhill
 /vllm/renderers @DarkLight1337 @njhill
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/tool_parsers @aarnphm @chaunceyjiang
 # vLLM V1
-/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention @LucasWilkinson @MatthewBonanni
 /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /vllm/v1/sample @22quinn @houseroad @njhill
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/kv_offload @ApostaC @orozery
 /vllm/v1/worker/gpu/kv_connector.py @orozery
-/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
+/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
 # Model runner V2
 /vllm/v1/worker/gpu @WoosukKwon
@@ -115,8 +133,8 @@ mkdocs.yaml @hmellor
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
 /vllm/model_executor/models/voxtral*.py @patrickvonplaten
 /vllm/model_executor/models/pixtral*.py @patrickvonplaten
 /vllm/tokenizers/mistral.py @patrickvonplaten
 /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 # Kernels
 /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -152,9 +170,7 @@ mkdocs.yaml @hmellor
 /examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
 /vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler @noooop
 # Security guide and policies
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -19,6 +19,7 @@ jobs:
        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
          cache: 'pip'
      - name: Install Python dependencies
        run: |
--- a/.gitignore
+++ b/.gitignore
@@ -238,3 +238,6 @@ ep_kernels_workspace/
 vllm/grpc/vllm_engine_pb2.py
 vllm/grpc/vllm_engine_pb2_grpc.py
 vllm/grpc/vllm_engine_pb2.pyi
 # Ignore generated cpu headers 
 csrc/cpu/cpu_attn_dispatch_generated.h
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -121,24 +121,9 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
-  - id: enforce-import-regex-instead-of-re
+  - id: check-forbidden-imports
-    name: Enforce import regex as re
+    name: Check for forbidden imports
-    entry: python tools/pre_commit/enforce_regex_import.py
+    entry: python tools/pre_commit/check_forbidden_imports.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  # Forbid importing triton directly
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
    entry: python tools/pre_commit/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
  - id: check-pickle-imports
    name: Prevent new pickle/cloudpickle imports
    entry: python tools/pre_commit/check_pickle_imports.py
    language: python
    types: [python]
    additional_dependencies: [regex]
@@ -158,6 +143,11 @@ repos:
    name: Check attention backend documentation is up to date
    entry: python tools/pre_commit/generate_attention_backend_docs.py --check
    language: python
  - id: check-boolean-context-manager
    name: Check for boolean ops in with-statements
    entry: python tools/pre_commit/check_boolean_context_manager.py
    language: python
    types: [python]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,13 +9,14 @@ build:
    python: "3.12"
  jobs:
    post_checkout:
-      - git fetch --unshallow || true
+      - git fetch origin main --unshallow --no-tags --filter=blob:none || true
    pre_create_environment:
      - pip install uv
    create_environment:
      - uv venv $READTHEDOCS_VIRTUALENV_PATH
    install:
      - uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt 
 mkdocs:
  configuration: mkdocs.yaml
  fail_on_warning: true
 # Optionally declare the Python requirements required to build your docs
 python:
  install:
    - requirements: requirements/docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
 #
 # Try to find python package with an executable that exactly matches
@@ -293,6 +293,7 @@ set(VLLM_EXT_SRC
  "csrc/fused_qknorm_rope_kernel.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/topk.cu"
  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -770,6 +771,24 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
  # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
  # The candidate arch list depends on the toolkit version: CUDA >= 13.0 uses
  # the "f"-suffixed entries, older toolkits use the explicit "a"-suffixed ones.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  # Build only when the compiler is CUDA >= 12.0 and the filtered arch list is
  # non-empty; the source is then appended to the main extension sources.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
    set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
    set_gencode_flags_for_srcs(
      SRCS "${DSV3_FUSED_A_GEMM_SRC}"
      CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
    list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
    message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
  else()
    # This branch is also taken when the CUDA compiler is older than 12.0, so
    # the message names both requirements instead of blaming the archs alone.
    message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
                   "in CUDA target architectures (requires SM90+ and CUDA >= 12.0).")
  endif()
  # moe_data.cu is used by all CUTLASS MoE kernels.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -1081,6 +1100,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()
  # DeepSeek V3 router GEMM kernel - requires SM90+
  # The candidate arch list depends on the toolkit version: CUDA >= 13.0 uses
  # the "f"-suffixed entries, older toolkits use the explicit "a"-suffixed ones.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
  else()
    cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
  endif()
  # Build only when the compiler is CUDA >= 12.0 and the filtered arch list is
  # non-empty; otherwise just report why the kernel is skipped.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
    # Entry point plus one translation unit per output dtype (float, bf16).
    set(DSV3_ROUTER_GEMM_SRC
      "csrc/moe/dsv3_router_gemm_entry.cu"
      "csrc/moe/dsv3_router_gemm_float_out.cu"
      "csrc/moe/dsv3_router_gemm_bf16_out.cu")
    set_gencode_flags_for_srcs(
      SRCS "${DSV3_ROUTER_GEMM_SRC}"
      CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
    # These sources go into the MoE extension source list, not VLLM_EXT_SRC.
    list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
    message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
  else()
    message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
                   " (requires SM90+ and CUDA >= 12.0)")
  endif()
 endif()
 message(STATUS "Enabling moe extension.")
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
 ## Usage
-For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
 For full CLI reference see:
--- a/benchmarks/attention_benchmarks/batch_spec.py
+++ b/benchmarks/attention_benchmarks/batch_spec.py
@@ -229,3 +229,40 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
            sum(r.kv_len for r in requests) / len(requests) if requests else 0
        ),
    }
 def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
    """
    Summarise which kinds of requests a batch spec contains.
    Args:
        batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
        spec_decode_threshold: Largest q_len for which an extend request is
            reported as "spec-decode" rather than "extend"
    Returns:
        "prefill", "decode", "spec-decode" or "extend" when every classified
        request has the same kind, "mixed (a+b...)" with the kinds sorted
        when several kinds are present, and "unknown" when no request was
        classified.
    """
    kinds = set()
    for request in parse_batch_spec(batch_spec):
        if request.is_decode:
            kind = "decode"
        elif request.is_prefill:
            kind = "prefill"
        elif not request.is_extend:
            # Matches none of the known categories: contributes nothing.
            continue
        elif request.q_len <= spec_decode_threshold:
            # Short extends are treated as speculative decoding.
            kind = "spec-decode"
        else:
            # Longer extends are treated as chunked prefill.
            kind = "extend"
        kinds.add(kind)
    if not kinds:
        return "unknown"
    if len(kinds) == 1:
        (only_kind,) = kinds
        return only_kind
    # Sorting keeps the label stable regardless of set iteration order.
    return f"mixed ({'+'.join(sorted(kinds))})"
--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
@@ -43,6 +43,7 @@ from common import (
    ModelParameterSweep,
    ParameterSweep,
    ResultsFormatter,
    batch_spec_sort_key,
    is_mla_backend,
 )
@@ -218,10 +219,13 @@ def run_model_parameter_sweep(
                        by_param_and_spec[key].append(r)
                        break
-    # Sort by param value then spec
+    # Sort by param value then spec (batch_size, q_len, kv_len)
    sorted_keys = sorted(
        by_param_and_spec.keys(),
-        key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
+        key=lambda x: (
            int(x[0]) if x[0].isdigit() else x[0],
            batch_spec_sort_key(x[1]),
        ),
    )
    current_param_value = None
@@ -330,7 +334,7 @@ def run_parameter_sweep(
                by_spec[spec] = []
            by_spec[spec].append(r)
-    for spec in sorted(by_spec.keys()):
+    for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
        results = by_spec[spec]
        best = min(results, key=lambda r: r.mean_time)
        console.print(
@@ -496,9 +500,12 @@ def main():
        if "description" in yaml_config:
            console.print(f"[dim]{yaml_config['description']}[/]")
-        # Override args with YAML values
+        # Override args with YAML values, but CLI args take precedence
-        # (YAML takes precedence unless CLI arg was explicitly set)
+        # Check if CLI provided backends (they would be non-None and not default)
-        # Backend(s)
+        cli_backends_provided = args.backends is not None or args.backend is not None
        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
            if "backend" in yaml_config:
                args.backend = yaml_config["backend"]
                args.backends = None
@@ -544,13 +551,15 @@ def main():
            args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
            args.block_size = model.get("block_size", args.block_size)
-        # Benchmark settings
+        # Benchmark settings (top-level keys)
-        if "benchmark" in yaml_config:
+        if "device" in yaml_config:
-            bench = yaml_config["benchmark"]
+            args.device = yaml_config["device"]
-            args.device = bench.get("device", args.device)
+        if "repeats" in yaml_config:
-            args.repeats = bench.get("repeats", args.repeats)
+            args.repeats = yaml_config["repeats"]
-            args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
+        if "warmup_iters" in yaml_config:
-            args.profile_memory = bench.get("profile_memory", args.profile_memory)
+            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]
        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -12,16 +12,36 @@ from typing import Any
 import numpy as np
 import torch
 from batch_spec import get_batch_type, parse_batch_spec
 from rich.console import Console
 from rich.table import Table
 def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
    """
    Build a numeric sort key for a batch spec string.
    The key is (batch_size, max_q_len, max_kv_len), so specs order by batch
    size first, then by longest query, then by longest KV length, instead
    of alphabetically. An empty parse yields zeros for both lengths.
    Any error raised while parsing or inspecting the spec produces the
    fallback key (0, 0, 0).
    """
    try:
        parsed = parse_batch_spec(spec)
        count = len(parsed)
        if not parsed:
            return (count, 0, 0)
        q_lens = [request.q_len for request in parsed]
        kv_lens = [request.kv_len for request in parsed]
        return (count, max(q_lens), max(kv_lens))
    except Exception:
        # Best-effort: unparseable specs share the lowest key
        return (0, 0, 0)
 # Mock classes for vLLM attention infrastructure
 class MockHfConfig:
    """Mock HuggingFace config that satisfies vLLM's requirements."""
-    def __init__(self, mla_dims: dict):
+    def __init__(self, mla_dims: dict, index_topk: int | None = None):
        self.num_attention_heads = mla_dims["num_q_heads"]
        self.num_key_value_heads = mla_dims["num_kv_heads"]
        self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -32,6 +52,8 @@ class MockHfConfig:
        self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
        self.v_head_dim = mla_dims["v_head_dim"]
        self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
        if index_topk is not None:
            self.index_topk = index_topk
    def get_text_config(self):
        return self
@@ -82,6 +104,38 @@ class MockKVBProj:
        return (result,)  # Return as tuple to match ColumnParallelLinear API
 class MockIndexer:
    """Stand-in indexer for benchmarking sparse MLA backends.
    Owns an int32 ``topk_indices_buffer`` of shape
    (max_num_tokens, topk_tokens), which sparse MLA backends read to decide
    which KV cache slots each token attends to.
    """
    def __init__(
        self,
        max_num_tokens: int,
        topk_tokens: int,
        device: torch.device,
    ):
        buffer_shape = (max_num_tokens, topk_tokens)
        self.topk_tokens = topk_tokens
        # Starts zero-filled; call fill_random_indices() to populate it.
        self.topk_indices_buffer = torch.zeros(
            buffer_shape, dtype=torch.int32, device=device
        )
    def fill_random_indices(self, num_tokens: int, max_kv_len: int):
        """Overwrite the first num_tokens rows with random indices in [0, max_kv_len)."""
        buffer = self.topk_indices_buffer
        # Rows from num_tokens onwards are left untouched.
        buffer[:num_tokens] = torch.randint(
            low=0,
            high=max_kv_len,
            size=(num_tokens, self.topk_tokens),
            dtype=torch.int32,
            device=buffer.device,
        )
 class MockLayer(AttentionLayerBase):
    """Mock attention layer with scale parameters and impl.
@@ -316,14 +370,19 @@ class ResultsFormatter:
            backends: List of backend names being compared
            compare_to_fastest: Show percentage comparison to fastest
        """
-        # Group by batch spec
+        # Group by batch spec, preserving first-occurrence order
        by_spec = {}
        specs_order = []
        for r in results:
            spec = r.config.batch_spec
            if spec not in by_spec:
                by_spec[spec] = {}
                specs_order.append(spec)
            by_spec[spec][r.config.backend] = r
        # Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
        specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
        # Create shortened backend names for display
        def shorten_backend_name(name: str) -> str:
            """Shorten long backend names for table display."""
@@ -337,6 +396,8 @@ class ResultsFormatter:
        table = Table(title="Attention Benchmark Results")
        table.add_column("Batch\nSpec", no_wrap=True)
        table.add_column("Type", no_wrap=True)
        table.add_column("Batch\nSize", justify="right", no_wrap=True)
        multi = len(backends) > 1
        for backend in backends:
@@ -350,12 +411,14 @@ class ResultsFormatter:
                table.add_column(col_rel, justify="right", no_wrap=False)
        # Add rows
-        for spec in sorted(by_spec.keys()):
+        for spec in specs_order:
            spec_results = by_spec[spec]
            times = {b: r.mean_time for b, r in spec_results.items() if r.success}
            best_time = min(times.values()) if times else 0.0
-            row = [spec]
+            batch_type = get_batch_type(spec)
            batch_size = len(parse_batch_spec(spec))
            row = [spec, batch_type, str(batch_size)]
            for backend in backends:
                if backend in spec_results:
                    r = spec_results[backend]
@@ -486,10 +549,11 @@ def get_attention_scale(head_dim: int) -> float:
 def is_mla_backend(backend: str) -> bool:
    """
-    Check if backend is an MLA backend using the backend's is_mla() property.
+    Check if backend is an MLA backend using the AttentionBackendEnum.
    Args:
-        backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
+        backend: Backend name matching AttentionBackendEnum exactly
        (e.g., "FLASHMLA_SPARSE")
    Returns:
        True if the backend is an MLA backend, False otherwise
@@ -497,7 +561,8 @@ def is_mla_backend(backend: str) -> bool:
    from vllm.v1.attention.backends.registry import AttentionBackendEnum
    try:
-        backend_class = AttentionBackendEnum[backend.upper()].get_class()
+        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
        return backend_class.is_mla()
-    except (KeyError, ValueError, ImportError):
+    except (KeyError, ValueError, ImportError, AttributeError):
        return False
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
@@ -3,7 +3,7 @@
 model:
  name: "deepseek-v3"
  num_layers: 60
-  num_q_heads: 128
+  num_q_heads: 128  # Base value, can be swept for TP simulation
  num_kv_heads: 1  # MLA uses single latent KV
  head_dim: 576
  kv_lora_rank: 512
@@ -12,6 +12,13 @@ model:
  v_head_dim: 128
  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
 # Model parameter sweep: simulate tensor parallelism by varying num_q_heads
 # TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
 model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"
 batch_specs:
  # Small batches, varying sequence lengths
  - "16q1s512"     # 16 requests, 512 KV cache
@@ -34,28 +41,30 @@ batch_specs:
  # Very large batches
  - "128q1s1k"     # 128 requests, 1k KV cache
  - "128q1s2k"     # 128 requests, 2k KV cache
  - "128q1s4k"     # 128 requests, 4k KV cache
  - "128q1s8k"     # 128 requests, 8k KV cache
  # Long context
  - "32q1s16k"     # 32 requests, 16k KV cache
  - "32q1s32k"     # 32 requests, 32k KV cache
 backends:
-  - cutlass_mla
+  - CUTLASS_MLA
-  - flashinfer_mla
+  - FLASHINFER_MLA
-  - flashattn_mla  # Hopper only
+  - FLASH_ATTN_MLA  # Hopper only
-  - flashmla        # Hopper only
+  - FLASHMLA        # Hopper only
 device: "cuda:0"
-repeats: 5
+repeats: 100
-warmup_iters: 3
+warmup_iters: 10
 profile_memory: true
 # Backend-specific tuning
-cutlass_mla:
+CUTLASS_MLA:
  num_kv_splits: auto  # or specific value like 4, 8, 16
-flashattn_mla:
+FLASH_ATTN_MLA:
  reorder_batch_threshold: 512
-flashmla:
+FLASHMLA:
  reorder_batch_threshold: 1
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -45,10 +45,10 @@ batch_specs:
  - "4q4k_60q1s4k"          # 4 prefill + 60 decode
 backends:
-  - cutlass_mla
+  - CUTLASS_MLA
-  - flashinfer_mla
+  - FLASHINFER_MLA
-  - flashattn_mla   # Hopper only
+  - FLASH_ATTN_MLA   # Hopper only
-  - flashmla        # Hopper only
+  - FLASHMLA         # Hopper only
 device: "cuda:0"
 repeats: 5
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
@@ -0,0 +1,62 @@
 # MLA prefill-only benchmark configuration for sparse backends
 # Model dimensions for the benchmarked attention layer.
 model:
  name: "deepseek-v3"
  num_layers: 60
  num_q_heads: 128  # Base value; swept by model_parameter_sweep below
  num_kv_heads: 1
  head_dim: 576
  kv_lora_rank: 512
  qk_nope_head_dim: 128
  qk_rope_head_dim: 64
  v_head_dim: 128
  block_size: 128
 # Model parameter sweep: simulate tensor parallelism by varying num_q_heads
 # TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
 model_parameter_sweep:
  param_name: "num_q_heads"
  values: [128, 64, 32, 16]
  label_format: "{backend}_{value}h"
 # NOTE(review): spec strings appear to follow <count>q<q_len>[s<kv_len>],
 # with a "k" suffix meaning thousands - confirm against parse_batch_spec.
 batch_specs:
  # Pure prefill
  - "1q512"
  - "1q1k"
  - "1q2k"
  - "1q4k"
  - "1q8k"
  # Batched pure prefill
  - "2q512"
  - "2q1k"
  - "2q2k"
  - "2q4k"
  - "2q8k"
  - "4q512"
  - "4q1k"
  - "4q2k"
  - "4q4k"
  - "4q8k"
  - "8q512"
  - "8q1k"
  - "8q2k"
  - "8q4k"
  - "8q8k"
  # Extend
  - "1q512s4k"
  - "1q512s8k"
  - "1q1ks8k"
  - "1q2ks8k"
  - "1q2ks16k"
  - "1q4ks16k"
 # Backend names must match AttentionBackendEnum member names exactly.
 backends:
  - FLASHMLA_SPARSE
  - FLASHINFER_MLA_SPARSE
 # Benchmark settings (read as top-level keys)
 device: "cuda:0"
 repeats: 10
 warmup_iters: 3
 profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
@@ -6,7 +6,7 @@
 description: "Decode vs Prefill pipeline crossover analysis"
 # Test FlashAttn MLA
-backend: flashattn_mla
+backend: FLASH_ATTN_MLA
 # Mode: decode_vs_prefill comparison (special sweep mode)
 # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,11 +62,10 @@ model:
  block_size: 128
 # Benchmark settings
-benchmark:
+device: "cuda:0"
-  device: "cuda:0"
+repeats: 15          # More repeats for spec decode variance
-  repeats: 15          # More repeats for spec decode variance
+warmup_iters: 5
-  warmup_iters: 5
+profile_memory: false
  profile_memory: false
 # Output
 output:
--- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
@@ -41,18 +41,17 @@ batch_specs:
 # Backends that support query length > 1
 backends:
-  - flashattn_mla    # reorder_batch_threshold = 512
+  - FLASH_ATTN_MLA    # reorder_batch_threshold = 512
-  - flashmla          # reorder_batch_threshold = 1 (tunable)
+  - FLASHMLA          # reorder_batch_threshold = 1 (tunable)
 # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
-# - flashinfer_mla
+# - FLASHINFER_MLA
 # Benchmark settings
-benchmark:
+device: "cuda:0"
-  device: "cuda:0"
+repeats: 10  # More repeats for statistical significance
-  repeats: 10  # More repeats for statistical significance
+warmup_iters: 5
-  warmup_iters: 5
+profile_memory: false
  profile_memory: false
 # Test these threshold values for optimization
 parameter_sweep:
--- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml
+++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml
@@ -25,14 +25,22 @@ batch_specs:
  - "4q1k_16q1s2k"     # 4 prefill + 16 decode
  - "2q4k_32q1s1k"     # 2 large prefill + 32 decode
-  # Context extension
+  # Speculative decode (q <= 8)
-  - "q1ks2k"          # 1k query, 2k sequence (chunked prefill)
+  - "16q2s1k"         # 16 requests, 2 spec tokens, 1k KV cache
  - "16q4s1k"         # 16 requests, 4 spec tokens, 1k KV cache
  - "16q8s1k"         # 16 requests, 8 spec tokens, 1k KV cache
  - "32q4s2k"         # 32 requests, 4 spec tokens, 2k KV cache
  - "8q8s4k"          # 8 requests, 8 spec tokens, 4k KV cache
  # Context extension (chunked prefill)
  - "q1ks2k"          # 1k query, 2k sequence
  - "2q1ks4k"         # 2 requests: 1k query, 4k sequence
 # Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
 backends:
-  - flash
+  - FLASH_ATTN
-  - triton
+  - TRITON_ATTN
-  - flashinfer
+  - FLASHINFER
 device: "cuda:0"
 repeats: 5
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -8,14 +8,13 @@ This module provides helpers for running MLA backends without
 needing full VllmConfig integration.
 """
 import importlib
 import numpy as np
 import torch
 from batch_spec import parse_batch_spec
 from common import (
    BenchmarkResult,
    MockHfConfig,
    MockIndexer,
    MockKVBProj,
    MockLayer,
    setup_mla_dims,
@@ -62,6 +61,7 @@ def create_minimal_vllm_config(
    block_size: int = 128,
    max_num_seqs: int = 256,
    mla_dims: dict | None = None,
    index_topk: int | None = None,
 ) -> VllmConfig:
    """
    Create minimal VllmConfig for MLA benchmarks.
@@ -73,6 +73,8 @@ def create_minimal_vllm_config(
        max_num_seqs: Maximum number of sequences
        mla_dims: Optional custom MLA dimensions dict. If not provided, uses
                  setup_mla_dims(model_name)
        index_topk: Optional topk value for sparse MLA backends. If provided,
                    the config will include index_topk for sparse attention.
    Returns:
        VllmConfig for benchmarking
@@ -82,7 +84,7 @@ def create_minimal_vllm_config(
        mla_dims = setup_mla_dims(model_name)
    # Create mock HF config first (avoids downloading from HuggingFace)
-    mock_hf_config = MockHfConfig(mla_dims)
+    mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
    # Create a temporary minimal config.json to avoid HF downloads
    # This ensures consistent ModelConfig construction without network access
@@ -120,16 +122,12 @@ def create_minimal_vllm_config(
            seed=0,
            max_model_len=32768,
            quantization=None,
            quantization_param_path=None,
            enforce_eager=False,
            max_context_len_to_capture=None,
            max_seq_len_to_capture=8192,
            max_logprobs=20,
            disable_sliding_window=False,
            skip_tokenizer_init=True,
            served_model_name=None,
            limit_mm_per_prompt=None,
            use_async_output_proc=True,
            config_format="auto",
        )
    finally:
@@ -180,56 +178,65 @@ def create_minimal_vllm_config(
 # ============================================================================
-# Backend name to class name prefix mapping
+# Backend-specific properties that can't be inferred from the backend class
-_BACKEND_NAME_MAP = {
+# Keys are AttentionBackendEnum names (uppercase)
    "flashattn_mla": "FlashAttnMLA",
    "flashmla": "FlashMLA",
    "flashinfer_mla": "FlashInferMLA",
    "cutlass_mla": "CutlassMLA",
 }
 # Special properties that differ from defaults
 _BACKEND_PROPERTIES = {
-    "flashmla": {
+    "FLASHMLA": {
        "query_format": "concat",  # Single concatenated tensor (vs tuple)
        "block_size": 64,  # FlashMLA uses fixed block size
    },
-    "flashinfer_mla": {
+    "FLASHMLA_SPARSE": {
-        "block_size": 64,  # FlashInfer MLA only supports 32 or 64
+        "query_format": "concat",  # Single concatenated tensor (vs tuple)
    },
 }
 def _get_backend_config(backend: str) -> dict:
    """
-    Get backend configuration using naming conventions.
+    Get backend configuration from AttentionBackendEnum.
-    All MLA backends follow the pattern:
+    Uses the registry to get the backend class and extract configuration
-    - Module: vllm.v1.attention.backends.mla.{backend}
+    from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
-    - Impl: {Name}Impl
+
-    - Metadata: {Name}Metadata (or MLACommonMetadata)
+    Args:
-    - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
+        backend: Backend name matching AttentionBackendEnum exactly
-    - MetadataBuilder: {Name}MetadataBuilder
+        (e.g., "FLASHMLA_SPARSE")
    Returns:
        Dict with backend configuration
    """
-    if backend not in _BACKEND_NAME_MAP:
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
        raise ValueError(f"Unknown backend: {backend}")
-    name = _BACKEND_NAME_MAP[backend]
+    try:
        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
    except (KeyError, ValueError) as e:
        valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
        raise ValueError(
            f"Unknown backend: {backend}. "
            f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
        ) from e
    # Get block size from backend class
    block_sizes = backend_class.get_supported_kernel_block_sizes()
    # Use first supported block size (backends typically support one for MLA)
    block_size = block_sizes[0] if block_sizes else None
    if hasattr(block_size, "value"):
        # Handle MultipleOf enum
        block_size = None
    # Check if sparse via class method if available
    is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
    # Get properties that can't be inferred
    props = _BACKEND_PROPERTIES.get(backend, {})
    # Check if backend uses common metadata (FlashInfer, CUTLASS)
    uses_common = backend in ("flashinfer_mla", "cutlass_mla")
    return {
-        "module": f"vllm.v1.attention.backends.mla.{backend}",
+        "backend_class": backend_class,
-        "impl_class": f"{name}Impl",
+        "impl_class": backend_class.get_impl_cls(),
-        "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
+        "builder_class": backend_class.get_builder_cls(),
        "decode_metadata_class": "MLACommonDecodeMetadata"
        if uses_common
        else f"{name}DecodeMetadata",
        "builder_class": f"{name}MetadataBuilder",
        "query_format": props.get("query_format", "tuple"),
-        "block_size": props.get("block_size", None),
+        "block_size": block_size,
        "is_sparse": is_sparse,
    }
@@ -447,22 +454,26 @@ def _create_backend_impl(
    mla_dims: dict,
    vllm_config: VllmConfig,
    device: torch.device,
    max_num_tokens: int = 8192,
    index_topk: int | None = None,
 ):
    """
    Create backend implementation instance.
    Args:
-        backend_cfg: Backend configuration dict
+        backend_cfg: Backend configuration dict from _get_backend_config()
        mla_dims: MLA dimension configuration
        vllm_config: VllmConfig instance
        device: Target device
        max_num_tokens: Maximum number of tokens for sparse indexer buffer
        index_topk: Topk value for sparse MLA backends
    Returns:
-        Tuple of (impl, layer, builder_instance)
+        Tuple of (impl, layer, builder_instance, indexer)
    """
-    # Import backend classes
+    # Get classes from backend config (already resolved by _get_backend_config)
-    backend_module = importlib.import_module(backend_cfg["module"])
+    impl_class = backend_cfg["impl_class"]
-    impl_class = getattr(backend_module, backend_cfg["impl_class"])
+    builder_class = backend_cfg["builder_class"]
    # Calculate scale
    scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
@@ -474,27 +485,45 @@ def _create_backend_impl(
        v_head_dim=mla_dims["v_head_dim"],
    )
-    # Create impl
+    # Create indexer for sparse backends
-    impl = impl_class(
+    indexer = None
-        num_heads=mla_dims["num_q_heads"],
+    if backend_cfg.get("is_sparse", False):
-        head_size=mla_dims["head_dim"],
+        if index_topk is None:
-        scale=scale,
+            index_topk = 2048  # Default topk for sparse MLA
-        num_kv_heads=mla_dims["num_kv_heads"],
+        indexer = MockIndexer(
-        alibi_slopes=None,
+            max_num_tokens=max_num_tokens,
-        sliding_window=None,
+            topk_tokens=index_topk,
-        kv_cache_dtype="auto",
+            device=device,
        logits_soft_cap=None,
        attn_type="decoder",
        kv_sharing_target_layer_name=None,
        q_lora_rank=None,
        kv_lora_rank=mla_dims["kv_lora_rank"],
        qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
        qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
        qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
        v_head_dim=mla_dims["v_head_dim"],
        kv_b_proj=mock_kv_b_proj,
        )
    # Build impl kwargs
    impl_kwargs = {
        "num_heads": mla_dims["num_q_heads"],
        "head_size": mla_dims["head_dim"],
        "scale": scale,
        "num_kv_heads": mla_dims["num_kv_heads"],
        "alibi_slopes": None,
        "sliding_window": None,
        "kv_cache_dtype": "auto",
        "logits_soft_cap": None,
        "attn_type": "decoder",
        "kv_sharing_target_layer_name": None,
        "q_lora_rank": None,
        "kv_lora_rank": mla_dims["kv_lora_rank"],
        "qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
        "qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
        "qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
        "v_head_dim": mla_dims["v_head_dim"],
        "kv_b_proj": mock_kv_b_proj,
    }
    # Add indexer for sparse backends
    if indexer is not None:
        impl_kwargs["indexer"] = indexer
    # Create impl
    impl = impl_class(**impl_kwargs)
    # Initialize DCP attributes
    if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
        impl.dcp_world_size = 1
@@ -515,9 +544,7 @@ def _create_backend_impl(
    # Create builder instance if needed
    builder_instance = None
-    if backend_cfg["builder_class"]:
+    if builder_class:
        builder_class = getattr(backend_module, backend_cfg["builder_class"])
        # Populate static_forward_context so builder can find the layer
        # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
        vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -529,7 +556,7 @@ def _create_backend_impl(
            device=device,
        )
-    return impl, layer, builder_instance
+    return impl, layer, builder_instance, indexer
 # ============================================================================
@@ -594,6 +621,7 @@ def _run_single_benchmark(
    backend_cfg: dict,
    mla_dims: dict,
    device: torch.device,
    indexer=None,
 ) -> BenchmarkResult:
    """
    Run a single benchmark iteration.
@@ -606,6 +634,7 @@ def _run_single_benchmark(
        backend_cfg: Backend configuration dict
        mla_dims: MLA dimension configuration
        device: Target device
        indexer: Optional MockIndexer for sparse backends
    Returns:
        BenchmarkResult with timing statistics
@@ -613,7 +642,9 @@ def _run_single_benchmark(
    # Parse batch spec
    requests = parse_batch_spec(config.batch_spec)
    q_lens = [r.q_len for r in requests]
    kv_lens = [r.kv_len for r in requests]
    total_q = sum(q_lens)
    max_kv_len = max(kv_lens)
    # Determine block size
    block_size = backend_cfg["block_size"] or config.block_size
@@ -641,8 +672,16 @@ def _run_single_benchmark(
        torch.bfloat16,
    )
-    # Determine which forward method to use based on metadata
+    # Fill indexer with random indices for sparse backends
-    if metadata.decode is not None:
+    is_sparse = backend_cfg.get("is_sparse", False)
    if is_sparse and indexer is not None:
        indexer.fill_random_indices(total_q, max_kv_len)
    # Determine which forward method to use
    if is_sparse:
        # Sparse backends use forward_mqa
        forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
    elif metadata.decode is not None:
        forward_fn = lambda: impl._forward_decode(
            decode_inputs, kv_cache, metadata, layer
        )
@@ -693,11 +732,13 @@ def _run_single_benchmark(
 def _run_mla_benchmark_batched(
    backend: str,
    configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
    index_topk: int = 2048,
 ) -> list[BenchmarkResult]:
    """
    Unified batched MLA benchmark runner for all backends.
-    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
+    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
               flashinfer_mla_sparse, flashmla_sparse
    This function reuses backend initialization across multiple benchmarks
    to avoid setup/teardown overhead.
@@ -707,6 +748,7 @@ def _run_mla_benchmark_batched(
        configs_with_params: List of (config, threshold, num_splits) tuples
            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
            - num_splits: num_kv_splits (CUTLASS only)
        index_topk: Topk value for sparse MLA backends (default 2048)
    Returns:
        List of BenchmarkResult objects
@@ -730,19 +772,27 @@ def _run_mla_benchmark_batched(
    if mla_dims is None:
        mla_dims = setup_mla_dims("deepseek-v3")
    # Determine if this is a sparse backend
    is_sparse = backend_cfg.get("is_sparse", False)
    # Create and set vLLM config for MLA (reused across all benchmarks)
    vllm_config = create_minimal_vllm_config(
        model_name="deepseek-v3",  # Used only for model path
        block_size=block_size,
        mla_dims=mla_dims,  # Use custom dims from config or default
        index_topk=index_topk if is_sparse else None,
    )
    results = []
    with set_current_vllm_config(vllm_config):
-        # Create backend impl, layer, and builder (reused across benchmarks)
+        # Create backend impl, layer, builder, and indexer (reused across benchmarks)
-        impl, layer, builder_instance = _create_backend_impl(
+        impl, layer, builder_instance, indexer = _create_backend_impl(
-            backend_cfg, mla_dims, vllm_config, device
+            backend_cfg,
            mla_dims,
            vllm_config,
            device,
            index_topk=index_topk if is_sparse else None,
        )
        # Run each benchmark with the shared impl
@@ -768,6 +818,7 @@ def _run_mla_benchmark_batched(
                    backend_cfg,
                    mla_dims,
                    device,
                    indexer=indexer,
                )
                results.append(result)
@@ -793,20 +844,24 @@ def run_mla_benchmark(
    config,
    reorder_batch_threshold: int | None = None,
    num_kv_splits: int | None = None,
    index_topk: int = 2048,
 ) -> BenchmarkResult | list[BenchmarkResult]:
    """
    Unified MLA benchmark runner for all backends.
-    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
+    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
               flashinfer_mla_sparse, flashmla_sparse
    Always uses batched execution internally for optimal performance.
    Args:
-        backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
+        backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
                 flashinfer_mla_sparse, flashmla_sparse)
        config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
        reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
                                 (single config mode only)
        num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
        index_topk: Topk value for sparse MLA backends (default 2048)
    Returns:
        BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -816,9 +871,9 @@ def run_mla_benchmark(
        # Already in batched format
        if len(config) > 0 and isinstance(config[0], tuple):
            # Format: [(cfg, param), ...] where param is threshold or num_splits
-            if backend in ("flashattn_mla", "flashmla"):
+            if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
                configs_with_params = [(cfg, param, None) for cfg, param in config]
-            else:  # cutlass_mla or flashinfer_mla
+            else:  # cutlass_mla, flashinfer_mla, or sparse backends
                configs_with_params = [(cfg, None, param) for cfg, param in config]
        else:
            # Format: [cfg, ...] - just configs
@@ -830,7 +885,7 @@ def run_mla_benchmark(
        return_single = True
    # Use unified batched execution
-    results = _run_mla_benchmark_batched(backend, configs_with_params)
+    results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
    # Return single result or list based on input
    return results[0] if return_single else results
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -8,7 +8,9 @@ This module provides helpers for running standard attention backends
 (FlashAttention, Triton, FlashInfer) with real vLLM integration.
 """
 import logging
 import types
 from contextlib import contextmanager
 import numpy as np
 import torch
@@ -24,8 +26,13 @@ from vllm.config import (
    ParallelConfig,
    SchedulerConfig,
    VllmConfig,
    set_current_vllm_config,
 )
 from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    get_kv_cache_layout,
    set_kv_cache_layout,
 )
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 # ============================================================================
@@ -33,37 +40,41 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec
 # ============================================================================
 _BACKEND_CONFIG = {
    "flash": {
        "module": "vllm.v1.attention.backends.flash_attn",
        "backend_class": "FlashAttentionBackend",
        "dtype": torch.float16,
        "cache_layout": "standard",
        # ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
    },
    "triton": {
        "module": "vllm.v1.attention.backends.triton_attn",
        "backend_class": "TritonAttentionBackend",
        "dtype": torch.float32,
        "cache_layout": "standard",
    },
    "flashinfer": {
        "module": "vllm.v1.attention.backends.flashinfer",
        "backend_class": "FlashInferBackend",
        "dtype": torch.float16,
        "cache_layout": "flashinfer",
        # ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
    },
 }
 def _get_backend_config(backend: str) -> dict:
-    if backend not in _BACKEND_CONFIG:
+    """
    Get backend configuration from AttentionBackendEnum.
    Args:
        backend: Backend name matching AttentionBackendEnum exactly
                 (e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
    Returns:
        Dict with backend_class
    """
    from vllm.v1.attention.backends.registry import AttentionBackendEnum
    try:
        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
    except (KeyError, ValueError) as e:
        valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
        raise ValueError(
-            f"Unknown backend: {backend}. "
+            f"Unknown backend: {backend}. Valid backends: {valid_backends}"
-            f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
+        ) from e
-        )
+
-    return _BACKEND_CONFIG[backend]
+    return {"backend_class": backend_class}
@contextmanager
 def log_warnings_and_errors_only():
    """Temporarily raise the "vllm" logger threshold to WARNING.

    Meant to be used as a ``with`` block: the level of the logger named
    "vllm" is set to ``logging.WARNING`` on entry and put back to the value
    it had before on exit, including when the wrapped code raises.
    """
    logger = logging.getLogger("vllm")
    # Remember the logger's own level so it can be restored afterwards.
    old_level = logger.level
    logger.setLevel(logging.WARNING)
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike.
        logger.setLevel(old_level)
 # ============================================================================
@@ -88,11 +99,7 @@ def _build_common_attn_metadata(
    query_start_loc_cpu = query_start_loc.cpu()
    seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
-    seq_lens_cpu = seq_lens.cpu()
+    max_seq_len = int(seq_lens.max().item())
    max_seq_len = int(seq_lens_cpu.max())
    context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
    max_blocks = (max(kv_lens) + block_size - 1) // block_size
    num_blocks = batch_size * max_blocks
@@ -107,8 +114,6 @@ def _build_common_attn_metadata(
        query_start_loc=query_start_loc,
        query_start_loc_cpu=query_start_loc_cpu,
        seq_lens=seq_lens,
        seq_lens_cpu=seq_lens_cpu,
        num_computed_tokens_cpu=num_computed_tokens_cpu,
        num_reqs=batch_size,
        num_actual_tokens=total_tokens,
        max_query_len=max_query_len,
@@ -121,7 +126,6 @@ def _build_common_attn_metadata(
 def _create_vllm_config(
    config: BenchmarkConfig,
    dtype: torch.dtype,
    max_num_blocks: int,
 ) -> VllmConfig:
    """Create a VllmConfig for benchmarking with mock model methods."""
@@ -129,7 +133,7 @@ def _create_vllm_config(
        model="meta-llama/Meta-Llama-3-8B",
        tokenizer="meta-llama/Meta-Llama-3-8B",
        trust_remote_code=False,
-        dtype=dtype,
+        dtype="auto",  # Use model's native dtype
        seed=0,
        max_model_len=1024,
    )
@@ -198,15 +202,12 @@ def _create_backend_impl(
    backend_cfg: dict,
    config: BenchmarkConfig,
    device: torch.device,
    dtype: torch.dtype,
 ):
    """Create backend implementation instance."""
-    import importlib
+    backend_class = backend_cfg["backend_class"]
    backend_module = importlib.import_module(backend_cfg["module"])
    backend_class = getattr(backend_module, backend_cfg["backend_class"])
    scale = get_attention_scale(config.head_dim)
    dtype = backend_cfg["dtype"]
    impl = backend_class.get_impl_cls()(
        num_heads=config.num_q_heads,
@@ -227,7 +228,7 @@ def _create_backend_impl(
    layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
-    return backend_class, impl, layer, dtype
+    return backend_class, impl, layer
 def _create_metadata_builder(
@@ -235,11 +236,44 @@ def _create_metadata_builder(
    kv_cache_spec: FullAttentionSpec,
    vllm_config: VllmConfig,
    device: torch.device,
    backend_name: str = "",
 ):
    """Create metadata builder instance."""
-    return backend_class.get_builder_cls()(
+    layer_names = ["layer_0"]
    builder_cls = backend_class.get_builder_cls()
    # Flashinfer needs get_per_layer_parameters mocked since we don't have
    # real model layers registered
    if backend_name == "FLASHINFER":
        import unittest.mock
        from vllm.v1.attention.backends.utils import PerLayerParameters
        def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
            head_size = vllm_config.model_config.get_head_size()
            return {
                layer_name: PerLayerParameters(
                    window_left=-1,  # No sliding window
                    logits_soft_cap=0.0,  # No soft cap
                    sm_scale=1.0 / (head_size**0.5),  # Standard scale
                )
                for layer_name in layer_names
            }
        with unittest.mock.patch(
            "vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
            mock_get_per_layer_parameters,
        ):
            return builder_cls(
                kv_cache_spec=kv_cache_spec,
-        layer_names=["layer_0"],
+                layer_names=layer_names,
                vllm_config=vllm_config,
                device=device,
            )
    return builder_cls(
        kv_cache_spec=kv_cache_spec,
        layer_names=layer_names,
        vllm_config=vllm_config,
        device=device,
    )
@@ -281,39 +315,44 @@ def _create_input_tensors(
 def _create_kv_cache(
    config: BenchmarkConfig,
    max_num_blocks: int,
-    cache_layout: str,
+    backend_class,
    device: torch.device,
    dtype: torch.dtype,
 ) -> list:
-    """Create KV cache tensors for all layers."""
+    """Create KV cache tensors for all layers using the backend's methods.
-    if cache_layout == "flashinfer":
+
-        # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
+    Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
-        cache_list = [
+    to create the cache with the correct shape and memory layout.
-            torch.zeros(
+    """
-                max_num_blocks,
+    # Get the logical shape from the backend
-                2,
+    cache_shape = backend_class.get_kv_cache_shape(
-                config.block_size,
+        num_blocks=max_num_blocks,
-                config.num_kv_heads,
+        block_size=config.block_size,
-                config.head_dim,
+        num_kv_heads=config.num_kv_heads,
-                device=device,
+        head_size=config.head_dim,
                dtype=dtype,
    )
-            for _ in range(config.num_layers)
+
-        ]
+    # Get the stride order for custom memory layout
-    else:
+    try:
-        # Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
+        stride_order = backend_class.get_kv_cache_stride_order()
-        cache_list = [
+        assert len(stride_order) == len(cache_shape)
-            torch.zeros(
+    except (AttributeError, NotImplementedError):
-                2,
+        stride_order = tuple(range(len(cache_shape)))
-                max_num_blocks,
+
-                config.block_size,
+    # Permute shape to physical layout order
-                config.num_kv_heads,
+    physical_shape = tuple(cache_shape[i] for i in stride_order)
-                config.head_dim,
+
-                device=device,
+    # Compute inverse permutation to get back to logical view
-                dtype=dtype,
+    inv_order = [stride_order.index(i) for i in range(len(stride_order))]
-            )
+
-            for _ in range(config.num_layers)
+    cache_list = []
-        ]
+    for _ in range(config.num_layers):
        # Allocate in physical layout order (contiguous in memory)
        cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
        # Permute to logical view
        cache = cache.permute(*inv_order)
        cache_list.append(cache)
    return cache_list
@@ -396,7 +435,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    """
    Run standard attention benchmark with real kernels.
-    Supports: flash, triton, flashinfer
+    Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
    Args:
        config: Benchmark configuration
@@ -411,20 +450,39 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    requests = parse_batch_spec(config.batch_spec)
-    if config.backend == "flashinfer":
+    if config.backend == "FLASHINFER":
        requests = reorder_for_flashinfer(requests)
    q_lens = [r.q_len for r in requests]
    kv_lens = [r.kv_len for r in requests]
    total_q = sum(q_lens)
    max_kv = max(kv_lens)
    batch_size = len(q_lens)
-    max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
+    # Calculate total blocks needed: batch_size * max_blocks_per_request
    max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
    max_num_blocks = batch_size * max_blocks_per_request
-    backend_class, impl, layer, dtype = _create_backend_impl(
+    # Suppress vLLM logs during setup to reduce spam
-        backend_cfg, config, device
+    with log_warnings_and_errors_only():
        # Create vllm_config first - uses model's native dtype via "auto"
        vllm_config = _create_vllm_config(config, max_num_blocks)
        dtype = vllm_config.model_config.dtype
        # Wrap everything in set_current_vllm_config context
        # This is required for backends like flashinfer that need global config
        with set_current_vllm_config(vllm_config):
            backend_class, impl, layer = _create_backend_impl(
                backend_cfg, config, device, dtype
            )
            # Set KV cache layout if the backend requires a specific one
            # (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
            required_layout = backend_class.get_required_kv_cache_layout()
            if required_layout is not None:
                set_kv_cache_layout(required_layout)
                get_kv_cache_layout.cache_clear()
            common_metadata = _build_common_attn_metadata(
                q_lens, kv_lens, config.block_size, device
            )
@@ -436,10 +494,8 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                dtype=dtype,
            )
    vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
            builder = _create_metadata_builder(
-        backend_class, kv_cache_spec, vllm_config, device
+                backend_class, kv_cache_spec, vllm_config, device, config.backend
            )
            attn_metadata = builder.build(
@@ -447,10 +503,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                common_attn_metadata=common_metadata,
            )
-    q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
+            q_list, k_list, v_list = _create_input_tensors(
                config, total_q, device, dtype
            )
            cache_list = _create_kv_cache(
-        config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
+                config, max_num_blocks, backend_class, device, dtype
            )
            times, mem_stats = _run_single_benchmark(
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
 echo "RESULT_FILE=$RESULT"
 echo "====================== AUTO TUNEPARAMETERS ===================="
-rm -rf $LOG_FOLDER
+rm -rf "$LOG_FOLDER"
-rm -rf $PROFILE_PATH
+rm -rf "$PROFILE_PATH"
-mkdir -p $LOG_FOLDER
+mkdir -p "$LOG_FOLDER"
-mkdir -p $PROFILE_PATH
+mkdir -p "$PROFILE_PATH"
 cd "$BASE/vllm"
@@ -114,7 +114,7 @@ start_server() {
    # wait for 10 minutes...
    server_started=0
-    for i in {1..60}; do
+    for _ in {1..60}; do
        # This line checks whether the server is still alive or not,
        # since we should always have permission to send a signal to the server process.
        kill -0 $server_pid 2> /dev/null || break
@@ -145,12 +145,12 @@ run_benchmark() {
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
-    rm -f $vllm_log
+    rm -f "$vllm_log"
    pkill -if "vllm serve" || true
    echo "starting server..."
    # Call start_server without a profile_dir to avoid profiling overhead
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
+    start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,15 +168,15 @@ run_benchmark() {
    # --profile flag is removed from this call
    vllm bench serve \
        --backend vllm \
-        --model $MODEL  \
+        --model "$MODEL"  \
        --dataset-name random \
        --random-input-len $adjusted_input_len \
-        --random-output-len $OUTPUT_LEN \
+        --random-output-len "$OUTPUT_LEN" \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
-        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+        --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
        --host "$HOSTNAME" \
@@ -195,20 +195,20 @@ run_benchmark() {
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
-            curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
+            curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            vllm bench serve \
                --backend vllm \
-                --model $MODEL  \
+                --model "$MODEL"  \
                --dataset-name random \
                --random-input-len $adjusted_input_len \
-                --random-output-len $OUTPUT_LEN \
+                --random-output-len "$OUTPUT_LEN" \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
-                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+                --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
                --num-prompts 100 \
                --random-prefix-len $prefix_len \
                --host "$HOSTNAME" \
@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
 find_gpu_memory_utilization=0
 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
    # Pass empty string for profile_dir argument
-    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
+    start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
@@ -274,7 +274,7 @@ fi
 for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
-        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
+        run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
    done
 done
 echo "finish permutations"
@@ -285,7 +285,7 @@ echo "finish permutations"
 if (( $(echo "$best_throughput > 0" | bc -l) )); then
    echo
    echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
-    echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
+    echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
    echo
    vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
    # Start server with the best params and profiling ENABLED
    echo "Starting server for profiling..."
-    start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
+    start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
    # Run benchmark with the best params and the --profile flag
    echo "Running benchmark with profiling..."
@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
    adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    vllm bench serve \
        --backend vllm \
-        --model $MODEL \
+        --model "$MODEL" \
        --dataset-name random \
        --random-input-len $adjusted_input_len \
-        --random-output-len $OUTPUT_LEN \
+        --random-output-len "$OUTPUT_LEN" \
        --ignore-eos \
        --disable-tqdm \
-        --request-rate $best_request_rate \
+        --request-rate "$best_request_rate" \
        --percentile-metrics ttft,tpot,itl,e2el \
-        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+        --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
        --num-prompts 100 \
        --random-prefix-len $prefix_len \
        --host "$HOSTNAME" \
--- a/benchmarks/auto_tune/batch_auto_tune.sh
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
  else
    STATUS="FAILURE"
    ((FAILURE_COUNT++))
-    FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
+    FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
  fi
  RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -0,0 +1,471 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations.
 Compares:
 - apply_top_k_top_p_triton (Triton binary search)
 - apply_top_k_top_p (PyTorch sort-based)
 Scenarios:
 - top_k only (whole batch, partial batch)
 - top_p only (whole batch, partial batch)
 - mix of top_k and top_p
 """
 import argparse
 import gc
 from dataclasses import dataclass
 import torch
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch
 from vllm.v1.sample.ops.topk_topp_triton import (
    apply_top_k_top_p_triton,
    reset_buffer_cache,
 )
@dataclass
 class BenchmarkConfig:
    """Configuration for a benchmark run.

    One instance describes a single scenario: the problem size
    (``batch_size`` x ``vocab_size``) and the per-row top-k / top-p settings
    handed to the implementations under test.
    """

    # Scenario identifier; create_benchmark_configs builds it as
    # "<scenario>_b<batch>_v<vocab // 1000>k" and print_summary_table
    # recovers the scenario part by splitting on "_b".
    name: str
    # Number of rows in the logits tensor.
    batch_size: int
    # Number of columns (vocabulary entries) in the logits tensor.
    vocab_size: int
    # k and p can be tensors or None
    # None leaves that filter out entirely; calculate_ops_pct treats a row
    # with k == vocab_size or p == 1.0 as not filtered.
    k_values: torch.Tensor | None  # [batch_size] or None
    p_values: torch.Tensor | None  # [batch_size] or None
    # Human-readable scenario summary printed in verbose mode.
    description: str
    ops_pct: float = 0.0  # Percentage of ops relative to batch size
 def calculate_ops_pct(
    k_values: torch.Tensor | None,
    p_values: torch.Tensor | None,
    vocab_size: int,
    batch_size: int,
 ) -> float:
    """
    Calculate the percentage of active top-k and top-p operations.
    Returns percentage where 100% = batch_size ops.
    E.g., if all rows have both top-k and top-p active, returns 200%.
    """
    active_ops = 0
    if k_values is not None:
        # Count rows where k < vocab_size (active top-k filtering)
        active_ops += (k_values < vocab_size).sum().item()
    if p_values is not None:
        # Count rows where p < 1.0 (active top-p filtering)
        active_ops += (p_values < 1.0).sum().item()
    return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0
 def create_logits(
    batch_size: int, vocab_size: int, device: str = "cuda"
 ) -> torch.Tensor:
    """Build a [batch_size, vocab_size] float32 tensor of synthetic logits.

    Every row holds the same set of values: the log of a normalized
    Zipf-like distribution, p(rank) proportional to rank^-1.1. Each row is
    shuffled with its own random permutation, so a handful of tokens carry
    most of the probability mass but at different positions per row.
    """
    # Zipf-like probabilities over ranks 1..vocab_size, normalized to sum to 1.
    rank = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device)
    weights = rank.pow(-1.1)
    weights = weights / weights.sum()

    # Log-probabilities serve directly as logits.
    row_template = weights.log()

    # Fill the batch row by row, shuffling the template independently each time.
    shuffled = torch.empty(
        (batch_size, vocab_size),
        dtype=row_template.dtype,
        device=row_template.device,
    )
    for row in range(batch_size):
        order = torch.randperm(vocab_size, device=device)
        shuffled[row] = row_template[order]
    return shuffled
 def measure_memory() -> tuple[int, int]:
    """Return (currently allocated, peak allocated) CUDA memory in bytes.

    The device is synchronized first so that pending kernels have finished
    before the counters are read. The second element comes from
    ``torch.cuda.max_memory_allocated()``, i.e. the peak of *allocated*
    memory (not reserved memory) since the peak statistics were last reset.
    """
    torch.cuda.synchronize()
    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
 def reset_memory_stats():
    """Release cached memory and reset the CUDA peak-memory counter.

    The steps run from "drop references" to "reset the counter" so that the
    new peak baseline does not include memory that is about to be freed:

    1. drop the Triton implementation's cached buffers,
    2. run the garbage collector so unreachable tensors are freed,
    3. hand the now-unused cached blocks back to the driver,
    4. reset the peak statistics.
    """
    reset_buffer_cache()
    # Collect before emptying the cache: blocks freed by the collector would
    # otherwise stay cached and still count toward the reset peak value.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
 def benchmark_function(
    func,
    logits: torch.Tensor,
    k: torch.Tensor | None,
    p: torch.Tensor | None,
    warmup_iters: int = 5,
    benchmark_iters: int = 20,
 ) -> tuple[float, int]:
    """
    Benchmark a function and return (avg_time_ms, peak_memory_bytes).
    Returns average time in milliseconds and peak memory usage.
    """
    # Warmup
    for _ in range(warmup_iters):
        logits_copy = logits.clone()
        func(logits_copy, k, p)
    torch.cuda.synchronize()
    # Reset memory stats before benchmark
    reset_memory_stats()
    # Benchmark
    start_events = [
        torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)
    ]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)]
    for i in range(benchmark_iters):
        logits_copy = logits.clone()
        start_events[i].record()
        func(logits_copy, k, p)
        end_events[i].record()
    torch.cuda.synchronize()
    # Calculate timing
    times = [
        start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters)
    ]
    avg_time = sum(times) / len(times)
    # Get peak memory
    _, peak_memory = measure_memory()
    return avg_time, peak_memory
 def _make_config(
    scenario: str,
    summary: str,
    batch_size: int,
    vocab_size: int,
    k_values: torch.Tensor | None,
    p_values: torch.Tensor | None,
 ) -> BenchmarkConfig:
    """Build one BenchmarkConfig using the shared name/description layout."""
    return BenchmarkConfig(
        name=f"{scenario}_b{batch_size}_v{vocab_size // 1000}k",
        batch_size=batch_size,
        vocab_size=vocab_size,
        k_values=k_values,
        p_values=p_values,
        description=f"{summary}, batch={batch_size}, vocab={vocab_size}",
        ops_pct=calculate_ops_pct(k_values, p_values, vocab_size, batch_size),
    )
 def create_benchmark_configs(
    batch_sizes: list[int],
    vocab_sizes: list[int],
    device: str = "cuda",
 ) -> list[BenchmarkConfig]:
    """Create all benchmark configurations.

    For every (vocab_size, batch_size) pair, six scenarios are produced in
    this order: top-k on the whole batch, top-k on half the batch, top-p on
    the whole batch, top-p on half the batch, top-k + top-p on the whole
    batch, and a mixed batch split into thirds. Rows that should not be
    filtered get k == vocab_size or p == 1.0.

    Args:
        batch_sizes: Batch sizes to generate scenarios for.
        vocab_sizes: Vocabulary sizes to generate scenarios for (outer loop).
        device: Device on which the k/p tensors are allocated.

    Returns:
        The configurations, grouped by vocabulary size, then batch size.
    """
    configs = []

    for vocab_size in vocab_sizes:
        for batch_size in batch_sizes:
            half = batch_size // 2
            third = batch_size // 3

            # 1. Top-k only - whole batch (all rows have k < vocab_size)
            k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
            configs.append(
                _make_config(
                    "topk_whole",
                    "Top-k only (whole batch, k=50)",
                    batch_size,
                    vocab_size,
                    k_all,
                    None,
                )
            )

            # 2. Top-k only - partial batch (half have k=50, half have k=vocab_size)
            k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device)
            k_partial[half:] = vocab_size  # No filtering for second half
            configs.append(
                _make_config(
                    "topk_partial",
                    "Top-k only (partial batch, 50% k=50, 50% k=vocab)",
                    batch_size,
                    vocab_size,
                    k_partial,
                    None,
                )
            )

            # 3. Top-p only - whole batch (all rows have p < 1.0)
            p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
            configs.append(
                _make_config(
                    "topp_whole",
                    "Top-p only (whole batch, p=0.9)",
                    batch_size,
                    vocab_size,
                    None,
                    p_all,
                )
            )

            # 4. Top-p only - partial batch (half have p=0.9, half have p=1.0)
            p_partial = torch.full(
                (batch_size,), 0.9, dtype=torch.float32, device=device
            )
            p_partial[half:] = 1.0  # No filtering for second half
            configs.append(
                _make_config(
                    "topp_partial",
                    "Top-p only (partial batch, 50% p=0.9, 50% p=1.0)",
                    batch_size,
                    vocab_size,
                    None,
                    p_partial,
                )
            )

            # 5. Mix of top-k and top-p (both applied to whole batch)
            k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device)
            p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device)
            configs.append(
                _make_config(
                    "topk_topp_whole",
                    "Top-k + Top-p (whole batch, k=100, p=0.9)",
                    batch_size,
                    vocab_size,
                    k_mix,
                    p_mix,
                )
            )

            # 6. Mix with partial application (some rows k only, some p only,
            #    some both). Start from "no filtering" and enable per third.
            k_mixed = torch.full(
                (batch_size,), vocab_size, dtype=torch.int32, device=device
            )
            p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device)
            # First third: k only
            k_mixed[:third] = 50
            # Second third: p only
            p_mixed[third : 2 * third] = 0.5
            # Last third (plus any remainder rows): both k and p
            k_mixed[2 * third :] = 100
            p_mixed[2 * third :] = 0.9
            configs.append(
                _make_config(
                    "mixed_partial",
                    # Describe the values actually assigned above.
                    "Mixed partial (1/3 k=50, 1/3 p=0.5, 1/3 k=100 + p=0.9)",
                    batch_size,
                    vocab_size,
                    k_mixed,
                    p_mixed,
                )
            )

    return configs
 def format_memory(bytes_val: int) -> str:
    """Render a byte count with the largest binary unit that fits.

    Values of at least 1 KB are shown with two decimals in KB, MB or GB
    (powers of 1024); anything smaller is printed as a plain byte count.
    """
    # Checked from the largest unit down so the first match wins.
    units = (("GB", 1024**3), ("MB", 1024**2), ("KB", 1024))
    for suffix, scale in units:
        if bytes_val >= scale:
            return f"{bytes_val / scale:.2f} {suffix}"
    return f"{bytes_val} B"
 def run_benchmark(
    configs: list[BenchmarkConfig],
    warmup_iters: int = 5,
    benchmark_iters: int = 20,
    verbose: bool = True,
 ):
    """Time both implementations on every config and collect the results.

    For each config a fresh logits tensor is created, the Triton kernel is
    measured first and the PyTorch implementation second, with memory
    statistics reset before each measurement and after the config is done.

    Returns a list with one dict per config holding the config itself, both
    timings (ms), both peak memory values (bytes), the PyTorch/Triton time
    ratio ("speedup") and the PyTorch/Triton memory ratio ("mem_ratio").
    A ratio is reported as infinity when the Triton value is not positive.
    """
    banner = "=" * 100
    print(banner)
    print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based")
    print(banner)
    print()

    results = []
    for config in configs:
        if verbose:
            print(f"Running: {config.description}")

        # Both implementations see the same input data.
        logits = create_logits(config.batch_size, config.vocab_size)

        # Triton is measured first, then PyTorch, each from a clean slate.
        measured = {}
        for label, impl in (
            ("triton", apply_top_k_top_p_triton),
            ("pytorch", apply_top_k_top_p_pytorch),
        ):
            reset_memory_stats()
            measured[label] = benchmark_function(
                impl,
                logits,
                config.k_values,
                config.p_values,
                warmup_iters,
                benchmark_iters,
            )
        triton_time, triton_mem = measured["triton"]
        pytorch_time, pytorch_mem = measured["pytorch"]

        if triton_time > 0:
            speedup = pytorch_time / triton_time
        else:
            speedup = float("inf")
        if triton_mem > 0:
            mem_ratio = pytorch_mem / triton_mem
        else:
            mem_ratio = float("inf")

        results.append(
            {
                "config": config,
                "triton_time_ms": triton_time,
                "pytorch_time_ms": pytorch_time,
                "triton_mem": triton_mem,
                "pytorch_mem": pytorch_mem,
                "speedup": speedup,
                "mem_ratio": mem_ratio,
            }
        )

        if verbose:
            print(f"  Triton:  {triton_time:.3f} ms, {format_memory(triton_mem)}")
            print(f"  PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}")
            print(f"  Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x")
            print()

        # Free the input before moving on to the next config.
        del logits
        reset_memory_stats()

    return results
 def print_summary_table(results: list[dict]):
    """Print one fixed-width table row per benchmark result.

    ``results`` is the list produced by run_benchmark. Rows are printed in
    the given order, and a dashed rule is inserted whenever the vocabulary
    size differs from that of the previous row.
    """
    heavy_rule = "=" * 130
    light_rule = "-" * 130

    print()
    print(heavy_rule)
    print("SUMMARY TABLE")
    print(heavy_rule)
    print()

    # Column titles, aligned with the row format used below.
    print(
        f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} "
        f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} "
        f"{'Tri Mem':>10} {'Pyt Mem':>10}"
    )
    print(light_rule)

    previous_vocab = None
    for entry in results:
        config = entry["config"]

        # Visually separate consecutive runs of different vocab sizes.
        if previous_vocab is not None and previous_vocab != config.vocab_size:
            print(light_rule)
        previous_vocab = config.vocab_size

        # The scenario label is the part of the name before "_b<batch>".
        scenario = config.name.split("_b")[0]
        triton_mem_text = format_memory(entry["triton_mem"])
        pytorch_mem_text = format_memory(entry["pytorch_mem"])
        print(
            f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} "
            f"{config.ops_pct:>5.0f}% "
            f"{entry['triton_time_ms']:>12.3f} {entry['pytorch_time_ms']:>13.3f} "
            f"{entry['speedup']:>7.2f}x "
            f"{triton_mem_text:>10} "
            f"{pytorch_mem_text:>10}"
        )

    print(heavy_rule)
 def main():
    """Command-line entry point for the top-k/top-p benchmark.

    Parses the options, echoes the chosen configuration, and then -- if a
    CUDA device is available -- builds the scenario list, runs every
    benchmark and prints the summary table. Without CUDA an error message
    is printed and the function returns without benchmarking.
    """
    # Defaults live in one place so the help text cannot drift from them.
    default_batch_sizes = [1, 4, 16, 64, 128, 512, 1024, 2048]
    default_vocab_sizes = [32768, 131072]  # 32k, 128k
    batch_sizes_text = " ".join(str(size) for size in default_batch_sizes)
    vocab_sizes_text = " ".join(str(size) for size in default_vocab_sizes)

    parser = argparse.ArgumentParser(
        description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations"
    )
    parser.add_argument(
        "--batch-sizes",
        type=int,
        nargs="+",
        default=default_batch_sizes,
        help=f"Batch sizes to test (default: {batch_sizes_text})",
    )
    parser.add_argument(
        "--vocab-sizes",
        type=int,
        nargs="+",
        default=default_vocab_sizes,
        help=f"Vocabulary sizes to test (default: {vocab_sizes_text})",
    )
    parser.add_argument(
        "--warmup-iters",
        type=int,
        default=5,
        help="Number of warmup iterations (default: 5)",
    )
    parser.add_argument(
        "--benchmark-iters",
        type=int,
        default=20,
        help="Number of benchmark iterations (default: 20)",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Only print summary table",
    )
    args = parser.parse_args()

    # Print configuration
    print(f"Batch sizes: {args.batch_sizes}")
    print(f"Vocab sizes: {args.vocab_sizes}")
    print(f"Warmup iterations: {args.warmup_iters}")
    print(f"Benchmark iterations: {args.benchmark_iters}")
    print()

    # Check CUDA
    if not torch.cuda.is_available():
        print("ERROR: CUDA is not available. This benchmark requires a GPU.")
        return

    device_name = torch.cuda.get_device_name(0)
    print(f"GPU: {device_name}")
    print()

    # Create configs
    configs = create_benchmark_configs(
        args.batch_sizes,
        args.vocab_sizes,
    )

    # Run benchmarks
    results = run_benchmark(
        configs,
        warmup_iters=args.warmup_iters,
        benchmark_iters=args.benchmark_iters,
        verbose=not args.quiet,
    )

    # Print summary
    print_summary_table(results)
 # Run the benchmark only when this file is executed as a script, not when it
 # is imported.
 if __name__ == "__main__":
    main()
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -13,6 +13,7 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from tqdm import tqdm
 import vllm._custom_ops as ops
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
@@ -291,6 +292,7 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()
@default_vllm_config()
 def main():
    torch.set_default_device("cuda")
    bench_params = get_bench_params()
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -7,6 +7,7 @@ import itertools
 import torch
 import vllm.model_executor.layers.activation  # noqa F401
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.custom_op import op_registry
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -18,6 +19,7 @@ intermediate_size = [3072, 9728, 12288]
 configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
@default_vllm_config()
 def benchmark_activation(
    batch_size: int,
    seq_len: int,
--- a/benchmarks/kernels/benchmark_block_fp8_gemm.py
+++ b/benchmarks/kernels/benchmark_block_fp8_gemm.py
@@ -8,6 +8,7 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
 import torch
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    W8A8BlockFp8LinearOp,
 )
@@ -40,6 +41,7 @@ DEEPSEEK_V3_SHAPES = [
 ]
@default_vllm_config()
 def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -11,6 +11,7 @@ import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
@@ -161,7 +162,7 @@ def bench_run(
                w2_fp8q_cutlass,
                topk_weights,
                topk_ids,
-                activation="silu",
+                activation=MoEActivation.SILU,
                global_num_experts=num_experts,
            )
    torch.cuda.synchronize()
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -30,6 +30,9 @@ import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
 from vllm.distributed.device_communicators.flashinfer_all_reduce import (
    FlashInferAllReduce,
 )
 from vllm.distributed.device_communicators.pynccl import (
    PyNcclCommunicator,
    register_nccl_symmetric_ops,
@@ -44,7 +47,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
 logger = init_logger(__name__)
 # Default sequence lengths to benchmark
-DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192]
 # Fixed hidden size and dtype for all benchmarks
 HIDDEN_SIZE = 8192
@@ -81,6 +84,7 @@ class CommunicatorBenchmark:
        self.symm_mem_comm = None
        self.symm_mem_comm_multimem = None
        self.symm_mem_comm_two_shot = None
        self.fi_ar_comm = None
        self._init_communicators()
@@ -161,6 +165,22 @@ class CommunicatorBenchmark:
            )
            self.symm_mem_comm_two_shot = None
        try:
            self.fi_ar_comm = FlashInferAllReduce(
                group=self.cpu_group,
                device=self.device,
            )
            if not self.fi_ar_comm.disabled:
                logger.info("Rank %s: FlashInferAllReduce initialized", self.rank)
            else:
                logger.info("Rank %s: FlashInferAllReduce disabled", self.rank)
                self.fi_ar_comm = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e
            )
            self.fi_ar_comm = None
    def benchmark_allreduce(
        self, sequence_length: int, num_warmup: int, num_trials: int
    ) -> dict[str, float]:
@@ -180,7 +200,8 @@ class CommunicatorBenchmark:
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
-                    "1stage",  # env variable value
+                    {"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"},
                    None,  # no destroy function
                )
            )
            # CustomAllreduce two-shot
@@ -190,7 +211,8 @@ class CommunicatorBenchmark:
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
-                    "2stage",  # env variable value
+                    {"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"},
                    None,  # no destroy function
                )
            )
@@ -202,7 +224,8 @@ class CommunicatorBenchmark:
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t: True,  # Always available if initialized
                    nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
                    None,  # no destroy function
                )
            )
            communicators.append(
@@ -211,7 +234,8 @@ class CommunicatorBenchmark:
                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
                    lambda t: True,  # Always available if initialized
                    nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
                    None,  # no destroy function
                )
            )
@@ -223,7 +247,8 @@ class CommunicatorBenchmark:
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
                    None,  # no destroy function
                )
            )
@@ -235,19 +260,48 @@ class CommunicatorBenchmark:
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
-                    None,  # no env variable needed
+                    {},  # no env variable needed
                    None,  # no destroy function needed
                )
            )
        if self.fi_ar_comm is not None:
            comm = self.fi_ar_comm
            communicators.append(
                (
                    "flashinfer_trtllm",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_fi_ar(t),
                    nullcontext(),
                    {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"},
                    lambda c=comm: c.destroy(),
                )
            )
            communicators.append(
                (
                    "flashinfer_mnnvl",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_fi_ar(t),
                    nullcontext(),
                    {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"},
                    lambda c=comm: c.destroy(),
                )
            )
        # Benchmark each communicator
-        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+        for (
-            # Set environment variable if needed
+            name,
-            if env_var is not None:
+            allreduce_fn,
-                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+            should_use_fn,
-            else:
+            context,
-                # Clear the environment variable to avoid interference
+            env_dict,
-                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+            destroy_fn,
-
+        ) in communicators:
            # Save original values and apply new environment variables
            saved_env = {key: os.environ.get(key) for key in env_dict}
            for key, value in env_dict.items():
                os.environ[key] = value
            try:
                latency = self.benchmark_allreduce_single(
                    sequence_length,
                    allreduce_fn,
@@ -258,6 +312,15 @@ class CommunicatorBenchmark:
                )
                if latency is not None:
                    results[name] = latency
            finally:
                if destroy_fn is not None:
                    destroy_fn()
                # Restore environment variables to their original state
                for key, original_value in saved_env.items():
                    if original_value is None:
                        os.environ.pop(key, None)
                    else:
                        os.environ[key] = original_value
        return results
--- a/benchmarks/kernels/benchmark_fp8_gemm.py
+++ b/benchmarks/kernels/benchmark_fp8_gemm.py
--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -5,8 +5,11 @@
 Benchmark for FlashInfer fused collective operations vs standard operations.
 This benchmark compares:
-1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
+1. FlashInfer's allreduce_fusion with trtllm backend
-2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
+   (fused allreduce + rmsnorm + optional FP8/FP4 quant)
 2. FlashInfer's allreduce_fusion with mnnvl backend
   (fused allreduce + rmsnorm only, no quantization support)
 3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
 Usage with torchrun:
    torchrun --nproc_per_node=2 benchmark_fused_collective.py
@@ -24,7 +27,6 @@ import torch.distributed as dist  # type: ignore
 from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.distributed import (
    get_tp_group,
    tensor_model_parallel_all_reduce,
 )
 from vllm.distributed.parallel_state import (
@@ -49,14 +51,19 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant
 logger = init_logger(__name__)
 # Try to import FlashInfer
 TorchDistBackend = None
 try:
    import flashinfer.comm as flashinfer_comm  # type: ignore
-
+    from flashinfer.comm.mnnvl import (  # type: ignore
-    if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
+        TorchDistBackend,
        flashinfer_comm = None
        logger.warning(
            "FlashInfer comm module found but missing trtllm_allreduce_fusion"
    )
    if not (
        hasattr(flashinfer_comm, "allreduce_fusion")
        and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace")
    ):
        flashinfer_comm = None
        logger.warning("FlashInfer comm module found but missing allreduce_fusion API")
 except ImportError:
    flashinfer_comm = None
    logger.warning("FlashInfer not found, only benchmarking standard operations")
@@ -74,57 +81,70 @@ _FI_MAX_SIZES = {
    8: 64 * MiB,  # 64MB
 }
-# Global workspace tensor for FlashInfer
+# Global workspace tensors for FlashInfer (keyed by backend name)
-_FI_WORKSPACE_TENSOR = None
+_FI_WORKSPACES: dict = {}
 # Backends to benchmark
 FLASHINFER_BACKENDS = ["trtllm", "mnnvl"]
 def setup_flashinfer_workspace(
    backend: str,
    world_size: int,
    rank: int,
    hidden_dim: int,
    max_token_num: int,
-    use_fp32_lamport: bool = False,
+    dtype: torch.dtype,
 ):
    """Setup FlashInfer workspace for fused allreduce operations."""
-    global _FI_WORKSPACE_TENSOR
+    global FI_WORKSPACES
    if flashinfer_comm is None:
-        return None, None
+        return None
    if world_size not in _FI_MAX_SIZES:
        logger.warning("FlashInfer not supported for world size %s", world_size)
-        return None, None
+        return None
    try:
-        # Create IPC workspace
+        kwargs = {}
-        ipc_handles, workspace_tensor = (
+        if TorchDistBackend is not None:
-            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+            kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD)
-                tp_rank=rank,
+
-                tp_size=world_size,
+        workspace = flashinfer_comm.create_allreduce_fusion_workspace(
            backend=backend,
            world_size=world_size,
            rank=rank,
            max_token_num=max_token_num,
            hidden_dim=hidden_dim,
-                group=get_tp_group().device_group,
+            dtype=dtype,
-                use_fp32_lamport=use_fp32_lamport,
+            **kwargs,
            )
        )
-        _FI_WORKSPACE_TENSOR = workspace_tensor
+        _FI_WORKSPACES[backend] = workspace
-        return ipc_handles, workspace_tensor
+        return workspace
    except Exception as e:
-        logger.error("Failed to setup FlashInfer workspace: %s", e)
+        logger.error(
-        return None, None
+            "Failed to setup FlashInfer workspace (backend=%s): %s", backend, e
        )
        return None
-def cleanup_flashinfer_workspace(ipc_handles):
+def cleanup_flashinfer_workspaces():
-    """Cleanup FlashInfer workspace."""
+    """Cleanup all FlashInfer workspaces."""
-    if flashinfer_comm is None or ipc_handles is None:
+    if flashinfer_comm is None:
        return
    for backend, workspace in _FI_WORKSPACES.items():
        try:
-        group = get_tp_group().device_group
+            workspace.destroy()
        flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
        except Exception as e:
-        logger.error("Failed to cleanup FlashInfer workspace: %s", e)
+            logger.error(
                "Failed to cleanup FlashInfer workspace (backend=%s): %s",
                backend,
                e,
            )
    _FI_WORKSPACES.clear()
 class FlashInferFusedAllReduceParams:
@@ -132,25 +152,15 @@ class FlashInferFusedAllReduceParams:
    def __init__(
        self,
        rank: int,
        world_size: int,
        use_fp32_lamport: bool = False,
        max_token_num: int = 1024,
    ):
        self.rank = rank
        self.world_size = world_size
        self.use_fp32_lamport = use_fp32_lamport
        self.trigger_completion_at_end = True
        self.launch_with_pdl = True
        self.fp32_acc = True
        self.max_token_num = max_token_num
-    def get_trtllm_fused_allreduce_kwargs(self):
+    def get_flashinfer_fused_allreduce_kwargs(self):
        return {
            "world_rank": self.rank,
            "world_size": self.world_size,
            "launch_with_pdl": self.launch_with_pdl,
            "trigger_completion_at_end": self.trigger_completion_at_end,
            "fp32_acc": self.fp32_acc,
        }
@@ -161,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm(
    rms_gamma: torch.Tensor,
    rms_eps: float,
    allreduce_params: "FlashInferFusedAllReduceParams",
    workspace: object,
    use_oneshot: bool,
    norm_out: torch.Tensor | None = None,
 ):
    """FlashInfer fused allreduce + rmsnorm operation."""
-    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+    if flashinfer_comm is None or workspace is None:
        raise RuntimeError("FlashInfer not available or workspace not initialized")
    if norm_out is None:
@@ -174,24 +185,25 @@ def flashinfer_fused_allreduce_rmsnorm(
    else:
        residual_out = input_tensor
-    flashinfer_comm.trtllm_allreduce_fusion(
+    layout_code = None
-        allreduce_in=input_tensor,
+    if workspace.backend == "trtllm":
-        token_num=input_tensor.shape[0],
+        layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
    flashinfer_comm.allreduce_fusion(
        input=input_tensor,
        workspace=workspace,
        pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
        residual_in=residual,
        residual_out=residual_out,
        norm_out=norm_out,
        rms_gamma=rms_gamma,
        rms_eps=rms_eps,
        hidden_dim=input_tensor.shape[-1],
        workspace_ptrs=_FI_WORKSPACE_TENSOR,
        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
        allreduce_out=None,
        quant_out=None,
        scale_out=None,
-        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        layout_code=layout_code,
        scale_factor=None,
        use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
    )
@@ -202,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
    rms_eps: float,
    scale_factor: torch.Tensor,
    allreduce_params: FlashInferFusedAllReduceParams,
    workspace: object,
    use_oneshot: bool = True,
    norm_out: torch.Tensor | None = None,
    quant_out: torch.Tensor | None = None,
 ):
-    """FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
+    """FlashInfer fused allreduce + rmsnorm + FP8 quantization.
-    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+
    Note: Only supported by the trtllm backend.
    """
    if flashinfer_comm is None or workspace is None:
        raise RuntimeError("FlashInfer not available or workspace not initialized")
    if norm_out is None:
@@ -216,24 +232,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
    else:
        residual_out = input_tensor
-    flashinfer_comm.trtllm_allreduce_fusion(
+    flashinfer_comm.allreduce_fusion(
-        allreduce_in=input_tensor,
+        input=input_tensor,
-        token_num=input_tensor.shape[0],
+        workspace=workspace,
        pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
        residual_in=residual,
        residual_out=residual_out,
        norm_out=norm_out,
        rms_gamma=rms_gamma,
        rms_eps=rms_eps,
        hidden_dim=input_tensor.shape[-1],
        workspace_ptrs=_FI_WORKSPACE_TENSOR,
        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
        allreduce_out=None,
        quant_out=quant_out,
        scale_out=None,
        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
        scale_factor=scale_factor,
        use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
    )
@@ -244,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
    rms_eps: float,
    input_global_scale: torch.Tensor,
    allreduce_params: FlashInferFusedAllReduceParams,
    workspace: object,
    quant_out: torch.Tensor,
    use_oneshot: bool,
    output_scale: torch.Tensor,
    norm_out: torch.Tensor | None = None,
 ):
-    """FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
+    """FlashInfer fused allreduce + rmsnorm + FP4 quantization.
-    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+
    Note: Only supported by the trtllm backend.
    """
    if flashinfer_comm is None or workspace is None:
        raise RuntimeError("FlashInfer not available or workspace not initialized")
    if norm_out is None:
@@ -259,24 +276,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
    else:
        residual_out = input_tensor
-    flashinfer_comm.trtllm_allreduce_fusion(
+    flashinfer_comm.allreduce_fusion(
-        allreduce_in=input_tensor,
+        input=input_tensor,
-        token_num=input_tensor.shape[0],
+        workspace=workspace,
        pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
        residual_in=residual,
        residual_out=residual_out,
        norm_out=norm_out,
        rms_gamma=rms_gamma,
        rms_eps=rms_eps,
        hidden_dim=input_tensor.shape[-1],
        workspace_ptrs=_FI_WORKSPACE_TENSOR,
        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
        allreduce_out=None,
        quant_out=quant_out,
        scale_out=output_scale,
        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
        scale_factor=input_global_scale,
        use_oneshot=use_oneshot,
-        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+        **allreduce_params.get_flashinfer_fused_allreduce_kwargs(),
    )
@@ -409,13 +423,16 @@ def run_benchmarks(
    dtype: torch.dtype,
    use_residual: bool,
    allreduce_params: FlashInferFusedAllReduceParams | None,
    workspaces: dict,
    quant_modes: set[str],
    no_oneshot: bool,
 ):
    """Run all benchmarks for given configuration.
    Args:
-        quant_mode: "none", "fp8_only", "fp4_only", or "all"
+        allreduce_params: Shared parameters for FlashInfer fused allreduce.
        workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace.
        quant_modes: Set of quantization modes: "none", "fp8", "fp4".
    """
    (
        input_tensor,
@@ -431,18 +448,18 @@ def run_benchmarks(
    rms_eps = 1e-6
    results = {}
    vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
    use_oneshot_options = [False] if no_oneshot else [True, False]
    # Create RMSNorm and QuantFP8 layers once for native benchmarks
    if "none" in quant_modes:
        # Standard AllReduce + RMSNorm
        # Re-create VllmFusedAllreduce per config so CustomOp binds the
        # correct forward method (native vs custom kernel).
        for custom_op in ["-rms_norm", "+rms_norm"]:
            with set_current_vllm_config(
                VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op]))
            ):
                try:
                    vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                    suffix = (
                        "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm"
                    )
@@ -461,6 +478,7 @@ def run_benchmarks(
            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
        ):
            try:
                vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                standard_allreduce_rmsnorm_native_compiled = torch.compile(
                    vllm_fused_allreduce.allreduce_rmsnorm,
                    fullgraph=True,
@@ -476,10 +494,11 @@ def run_benchmarks(
                logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
                results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
-        # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot
+        # FlashInfer Fused AllReduce + RMSNorm (all backends)
-        if flashinfer_comm is not None and allreduce_params is not None:
+        for backend, workspace in workspaces.items():
            for use_oneshot in use_oneshot_options:
                suffix = "_oneshot" if use_oneshot else "_twoshot"
                key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}"
                try:
                    time_ms = benchmark_operation(
                        flashinfer_fused_allreduce_rmsnorm,
@@ -489,14 +508,17 @@ def run_benchmarks(
                        rms_gamma=rms_gamma,
                        rms_eps=rms_eps,
                        allreduce_params=allreduce_params,
                        workspace=workspace,
                        use_oneshot=use_oneshot,
                    )
-                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms
+                    results[key] = time_ms
                except Exception as e:
-                    logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e)
+                    logger.error(
-                    results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float(
+                        "FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s",
-                        "inf"
+                        backend,
                        e,
                    )
                    results[key] = float("inf")
    if "fp8" in quant_modes:
        # Standard AllReduce + RMSNorm + FP8 Quant
@@ -505,7 +527,7 @@ def run_benchmarks(
                "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm"
            )
            for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]:
-                suffix += (
+                op_suffix = suffix + (
                    "_custom_quant_fp8"
                    if "+" in quant_fp8_custom_op
                    else "_native_quant_fp8"
@@ -518,16 +540,17 @@ def run_benchmarks(
                    )
                ):
                    try:
                        vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                        time_ms = benchmark_operation(
                            vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
                            input_tensor,
                            residual=residual,
                            scale_factor=scale_fp8,
                        )
-                        results[f"standard_allreduce{suffix}"] = time_ms
+                        results[f"standard_allreduce{op_suffix}"] = time_ms
                    except Exception as e:
                        logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
-                        results[f"standard_allreduce{suffix}"] = float("inf")
+                        results[f"standard_allreduce{op_suffix}"] = float("inf")
        # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
        with set_current_vllm_config(
@@ -538,6 +561,7 @@ def run_benchmarks(
            )
        ):
            try:
                vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile(
                    vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant,
                    fullgraph=True,
@@ -560,10 +584,12 @@ def run_benchmarks(
                    "inf"
                )
-        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
+        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only)
-        if flashinfer_comm is not None and allreduce_params is not None:
+        if "trtllm" in workspaces:
            trtllm_ws = workspaces["trtllm"]
            for use_oneshot in use_oneshot_options:
                suffix = "_oneshot" if use_oneshot else "_twoshot"
                key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}"
                try:
                    time_ms = benchmark_operation(
                        flashinfer_fused_allreduce_rmsnorm_fp8_quant,
@@ -575,19 +601,16 @@ def run_benchmarks(
                        scale_factor=scale_fp8,
                        quant_out=quant_out_fp8,
                        allreduce_params=allreduce_params,
                        workspace=trtllm_ws,
                        use_oneshot=use_oneshot,
                    )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                    results[key] = time_ms
                        time_ms
                    )
                except Exception as e:
                    logger.error(
-                        "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
+                        "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s",
                        e,
                    )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = (
+                    results[key] = float("inf")
                        float("inf")
                    )
    if "fp4" in quant_modes and current_platform.has_device_capability(100):
        # Standard AllReduce + RMSNorm + FP4 Quant
@@ -603,6 +626,7 @@ def run_benchmarks(
                )
            ):
                try:
                    vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                    time_ms = benchmark_operation(
                        vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
                        input_tensor,
@@ -621,6 +645,7 @@ def run_benchmarks(
            VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"]))
        ):
            try:
                vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype)
                standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile(
                    vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant,
                    fullgraph=True,
@@ -645,10 +670,12 @@ def run_benchmarks(
                    "inf"
                )
-        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only)
-        if flashinfer_comm is not None and allreduce_params is not None:
+        if "trtllm" in workspaces:
            trtllm_ws = workspaces["trtllm"]
            for use_oneshot in use_oneshot_options:
                suffix = "_oneshot" if use_oneshot else "_twoshot"
                key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}"
                try:
                    time_ms = benchmark_operation(
                        flashinfer_fused_allreduce_rmsnorm_fp4_quant,
@@ -659,49 +686,18 @@ def run_benchmarks(
                        rms_eps=rms_eps,
                        input_global_scale=scale_fp4,
                        allreduce_params=allreduce_params,
                        workspace=trtllm_ws,
                        quant_out=fp4_quant_out,
                        output_scale=fp4_output_scale,
                        use_oneshot=use_oneshot,
                    )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                    results[key] = time_ms
                        time_ms
                    )
                except Exception as e:
                    logger.error(
-                        "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
+                        "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s",
                        e,
                    )
-                    results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = (
+                    results[key] = float("inf")
                        float("inf")
                    )
        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
        if flashinfer_comm is not None and allreduce_params is not None:
            try:
                time_ms = benchmark_operation(
                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
                    input_tensor,
                    residual=residual,
                    norm_out=norm_out,
                    rms_gamma=rms_gamma,
                    rms_eps=rms_eps,
                    input_global_scale=scale_fp4,
                    allreduce_params=allreduce_params,
                    quant_out=fp4_quant_out,
                    output_scale=fp4_output_scale,
                    use_oneshot=False,
                )
                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
                    time_ms
                )
            except Exception as e:
                logger.error(
                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
                    e,
                )
                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
                    "inf"
                )
    return results
@@ -1039,24 +1035,33 @@ def main():
    configs = list(itertools.product(args.num_tokens, dtypes, residual_options))
-    # Setup FlashInfer workspace if available
+    # Setup FlashInfer workspaces for all backends
    ipc_handles = None
    allreduce_params = None
    if flashinfer_comm is not None:
        # Use the largest hidden dimension for workspace setup
        max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes)
        workspace_dtype = (
            torch.float32
            if max_element_size == 4
            else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16)
        )
        max_num_token = _FI_MAX_SIZES.get(world_size) // (
-            args.hidden_dim * world_size * 2
+            args.hidden_dim * max_element_size
        )
-        ipc_handles, workspace_tensor = setup_flashinfer_workspace(
+        for backend in FLASHINFER_BACKENDS:
-            world_size, rank, args.hidden_dim, max_num_token
+            setup_flashinfer_workspace(
-        )
+                backend=backend,
        if workspace_tensor is not None:
            allreduce_params = FlashInferFusedAllReduceParams(
                rank=rank,
                world_size=world_size,
                rank=rank,
                hidden_dim=args.hidden_dim,
                max_token_num=max_num_token,
                dtype=workspace_dtype,
            )
        if _FI_WORKSPACES:
            allreduce_params = FlashInferFusedAllReduceParams(
                max_token_num=max_num_token,
            )
@@ -1081,6 +1086,7 @@ def main():
                dtype,
                use_residual,
                allreduce_params,
                workspaces=_FI_WORKSPACES,
                quant_modes=quant_modes,
                no_oneshot=args.no_oneshot,
            )
@@ -1119,11 +1125,13 @@ def main():
    finally:
        # Cleanup
-        if ipc_handles is not None:
+        cleanup_flashinfer_workspaces()
            cleanup_flashinfer_workspace(ipc_handles)
        dist.barrier()
 if __name__ == "__main__":
    from vllm.config import VllmConfig, set_current_vllm_config
    with set_current_vllm_config(VllmConfig()):
        main()
--- a/benchmarks/kernels/benchmark_int8_gemm.py
+++ b/benchmarks/kernels/benchmark_int8_gemm.py
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -5,12 +5,14 @@ import time
 import torch
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@torch.inference_mode()
@default_vllm_config()
 def main(
    num_tokens: int,
    hidden_size: int,
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -16,6 +16,7 @@ import torch
 from ray.experimental.tqdm_ray import tqdm
 from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -99,13 +100,38 @@ def benchmark_config(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool = False,
    num_iters: int = 100,
    block_quant_shape: list[int] = None,
    use_deep_gemm: bool = False,
 ) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    if use_int8_w8a16:
+    if use_int4_w4a16:
        # Int4 packed weights: 2 int4 values per uint8 byte
        # K dimension is packed (halved)
        intermediate_size = shard_intermediate_size // 2  # after silu_and_mul
        w1 = torch.randint(
            0,
            255,
            (
                num_experts,
                shard_intermediate_size,
                hidden_size // 2,  # int4 packing
            ),
            dtype=torch.uint8,
        )
        w2 = torch.randint(
            0,
            255,
            (
                num_experts,
                hidden_size,
                intermediate_size // 2,  # int4 packing
            ),
            dtype=torch.uint8,
        )
    elif use_int8_w8a16:
        w1 = torch.randint(
            -127,
            127,
@@ -139,7 +165,20 @@ def benchmark_config(
    w2_scale = None
    a1_scale = None
    a2_scale = None
-    if use_int8_w8a16:
+    if use_int4_w4a16:
        if block_quant_shape is None:
            raise ValueError("block_quant_shape is required for int4_w4a16")
        group_size = block_quant_shape[1]
        # Scales shape: (E, N, K // group_size) in fp16
        w1_scale = torch.rand(
            (num_experts, shard_intermediate_size, hidden_size // group_size),
            dtype=dtype,
        )
        w2_scale = torch.rand(
            (num_experts, hidden_size, intermediate_size // group_size),
            dtype=dtype,
        )
    elif use_int8_w8a16:
        w1_scale = torch.randn(
            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
        )
@@ -198,6 +237,7 @@ def benchmark_config(
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            block_shape=block_quant_shape,
            weight_dtype="int4" if use_int4_w4a16 else None,
        )
        deep_gemm_experts = None
@@ -211,7 +251,8 @@ def benchmark_config(
                        hidden_dim=hidden_size,
                        intermediate_size_per_partition=shard_intermediate_size,
                        num_local_experts=num_experts,
-                        activation="silu",
+                        num_logical_experts=num_experts,
                        activation=MoEActivation.SILU,
                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
                        in_dtype=init_dtype,
                        routing_method=RoutingMethodType.TopK,
@@ -226,9 +267,10 @@ def benchmark_config(
                x, input_gating, topk, renormalize=not use_deep_gemm
            )
            inplace = not disable_inplace()
            if use_deep_gemm:
                return deep_gemm_experts(
-                    x, w1, w2, topk_weights, topk_ids, inplace=True
+                    x, w1, w2, topk_weights, topk_ids, inplace=inplace
                )
            return fused_experts(
                x,
@@ -236,7 +278,7 @@ def benchmark_config(
                w2,
                topk_weights,
                topk_ids,
-                inplace=True,
+                inplace=inplace,
                quant_config=quant_config,
            )
@@ -478,6 +520,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        use_int4_w4a16: bool = False,
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
@@ -485,7 +528,10 @@ class BenchmarkWorker:
        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
-            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+            dtype,
            use_int8_w8a16=use_int8_w8a16,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int4_w4a16=use_int4_w4a16,
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
@@ -516,6 +562,7 @@ class BenchmarkWorker:
            dtype,
            use_fp8_w8a8,
            use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
            num_iters=100,
            block_quant_shape=block_quant_shape,
            use_deep_gemm=use_deep_gemm,
@@ -532,6 +579,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        use_int4_w4a16: bool,
        search_space: list[dict[str, int]],
        block_quant_shape: list[int],
        use_deep_gemm: bool,
@@ -542,7 +590,7 @@ class BenchmarkWorker:
        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():
-            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
            search_space = prune_rocm_search_space(
                num_tokens,
                shard_intermediate_size,
@@ -571,6 +619,7 @@ class BenchmarkWorker:
                        dtype,
                        use_fp8_w8a8,
                        use_int8_w8a16,
                        use_int4_w4a16,
                        num_iters=20,
                        block_quant_shape=block_quant_shape,
                        use_deep_gemm=use_deep_gemm,
@@ -618,6 +667,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
            else {}
        ),
        **({"kpack": config["kpack"]} if "kpack" in config else {}),
        **({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
    }
@@ -630,11 +680,15 @@ def save_configs(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
    block_quant_shape: list[int],
    save_dir: str,
 ) -> None:
    dtype_str = _get_config_dtype_str(
-        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+        dtype,
        use_int8_w8a16=use_int8_w8a16,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int4_w4a16=use_int4_w4a16,
    )
    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -686,6 +740,7 @@ def get_model_params(config):
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
        "GlmMoeDsaForCausalLM",
        "Glm4MoeForCausalLM",
        "Glm4MoeLiteForCausalLM",
        "NemotronHForCausalLM",
@@ -735,6 +790,38 @@ def get_model_params(config):
    return E, topk, intermediate_size, hidden_size
 def get_quantization_group_size(config) -> int | None:
    """Extract the quantization group size from the HF model config.
    This reads directly from the HuggingFace config object (as returned by
    ``get_config()``), not from vLLM's quantization config classes.
    Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
    compressed-tensors configs (nested inside 'config_groups').
    """
    quantization_config = getattr(config, "quantization_config", {})
    if not isinstance(quantization_config, dict):
        return None
    # AWQ / GPTQ style: group_size is a top-level key
    gs = quantization_config.get("group_size")
    if gs is not None:
        return gs
    # compressed-tensors style: group_size is nested in config_groups
    config_groups = quantization_config.get("config_groups", {})
    if not isinstance(config_groups, dict):
        return None
    for group_cfg in config_groups.values():
        if not isinstance(group_cfg, dict):
            continue
        weights = group_cfg.get("weights", {})
        if not isinstance(weights, dict):
            continue
        gs = weights.get("group_size")
        if gs is not None:
            return gs
    return None
 def main(args: argparse.Namespace):
    print(args)
@@ -753,7 +840,20 @@ def main(args: argparse.Namespace):
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_int4_w4a16 = args.dtype == "int4_w4a16"
    block_quant_shape = get_weight_block_size_safety(config)
    if use_int4_w4a16:
        group_size = get_quantization_group_size(config)
        if group_size is None:
            raise ValueError(
                "Could not determine group_size from model config. "
                "The model's quantization_config must contain a 'group_size' "
                "field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
                "(compressed-tensors)."
            )
        # For int4_w4a16, block_shape = [0, group_size]
        # block_shape[0]=0 means no block quantization on N dimension
        block_quant_shape = [0, group_size]
    if args.batch_size is None:
        batch_sizes = [
@@ -807,8 +907,20 @@ def main(args: argparse.Namespace):
        return ray.get(outputs)
    if args.tune:
-        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        # int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
-        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
+        # search space generation (no matrix_instr_nonkdim/kpack exploration).
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
        # For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
        # apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
        # of group_size. Skip block_quant_shape filtering to keep the full
        # search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
        tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
        search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
        if use_int4_w4a16:
            # SPLIT_K is a required kernel constexpr for gptq_awq kernel;
            # only SPLIT_K=1 is used at runtime, so fix it during tuning.
            for cfg in search_space:
                cfg["SPLIT_K"] = 1
        print(f"Start tuning over {len(search_space)} configurations...")
        if use_deep_gemm:
            raise ValueError(
@@ -828,6 +940,7 @@ def main(args: argparse.Namespace):
                    dtype,
                    use_fp8_w8a8,
                    use_int8_w8a16,
                    use_int4_w4a16,
                    search_space,
                    block_quant_shape,
                    use_deep_gemm,
@@ -847,6 +960,7 @@ def main(args: argparse.Namespace):
            dtype,
            use_fp8_w8a8,
            use_int8_w8a16,
            use_int4_w4a16,
            block_quant_shape,
            args.save_dir,
        )
@@ -865,6 +979,7 @@ def main(args: argparse.Namespace):
                    dtype,
                    use_fp8_w8a8,
                    use_int8_w8a16,
                    use_int4_w4a16,
                    block_quant_shape,
                    use_deep_gemm,
                )
@@ -887,7 +1002,10 @@ if __name__ == "__main__":
    )
    parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
    parser.add_argument(
-        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
+        "--dtype",
        type=str,
        choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
        default="auto",
    )
    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument(
--- a/benchmarks/kernels/benchmark_moe_defaults.py
+++ b/benchmarks/kernels/benchmark_moe_defaults.py
@@ -0,0 +1,278 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark comparing old vs new default fused MoE configs.
 Runs the triton fused_moe kernel with three configurations for each scenario:
  1. Tuned config (from JSON file, if available) — the target to match
  2. Old default (the hardcoded defaults before this change)
  3. New default (the improved defaults)
 Usage:
    python benchmarks/kernels/benchmark_moe_defaults.py
 Produces a table showing kernel time (us) and speedup of new vs old defaults.
 """
 import torch
 from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_experts,
    get_default_config,
    get_moe_configs,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.torch_utils import set_random_seed
 # FP8 dtype reported by the current platform; benchmark_config() casts the
 # expert weights to this dtype when use_fp8 is set.
 FP8_DTYPE = current_platform.fp8_dtype()
 def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
    """Return the hardcoded fused-MoE kernel defaults used before
    https://github.com/vllm-project/vllm/pull/34846, kept for comparison.

    Only ``M``, ``E``, ``dtype`` and ``block_shape`` influence the result;
    ``N``, ``K`` and ``topk`` are accepted but not read here.

    Returns:
        A dict of kernel launch parameters (``BLOCK_SIZE_*``, ``GROUP_SIZE_M``,
        ``SPLIT_K`` and, for the fp8 block-quant case only, ``num_warps`` and
        ``num_stages``).
    """
    fp8_block_quant = dtype == "fp8_w8a8" and block_shape is not None
    if fp8_block_quant:
        # Tile N/K are taken directly from the quantization block shape.
        tile_n, tile_k = block_shape[0], block_shape[1]
        # ROCm gets 2 pipeline stages, every other platform 3.
        stages = 2 if current_platform.is_rocm() else 3
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": tile_n,
            "BLOCK_SIZE_K": tile_k,
            "GROUP_SIZE_M": 32,
            "SPLIT_K": 1,
            "num_warps": 4,
            "num_stages": stages,
        }
    # Otherwise pick between two fixed tilings: one when the batch has at
    # most as many rows as there are experts, another for larger batches.
    small_batch = M <= E
    tile_m, tile_n, tile_k, group_m = (
        (16, 32, 64, 1) if small_batch else (64, 64, 32, 8)
    )
    return {
        "BLOCK_SIZE_M": tile_m,
        "BLOCK_SIZE_N": tile_n,
        "BLOCK_SIZE_K": tile_k,
        "GROUP_SIZE_M": group_m,
        "SPLIT_K": 1,
    }
 def benchmark_config(
    config,
    M,
    E,
    N,
    K,
    topk,
    dtype,
    use_fp8=False,
    block_shape=None,
    num_iters=100,
    num_warmup=20,
 ):
    """Time a single kernel config and return the mean time per iteration.

    Random activations, expert weights and gating logits are allocated on the
    CUDA device. Every iteration runs ``fused_topk`` followed by
    ``fused_experts`` inside ``override_config(config)``, so the reported time
    covers both calls.

    Args:
        config: Kernel config dict handed to ``override_config``.
        M: Number of rows of the activation tensor ``a`` (shape ``(M, K)``).
        E: Number of experts.
        N: Intermediate size; ``w1`` is ``(E, 2 * N, K)``, ``w2`` is ``(E, K, N)``.
        K: Hidden size.
        topk: Number of experts selected per row by ``fused_topk``.
        dtype: torch dtype of the tensors when ``use_fp8`` is False.
        use_fp8: If True, tensors are created in float16, the weights are cast
            to ``FP8_DTYPE`` and random scales are supplied.
        block_shape: Optional ``[block_n, block_k]``; with ``use_fp8`` it
            selects per-block weight scales instead of one scale per expert.
        num_iters: Number of timed iterations (must be >= 1).
        num_warmup: Number of untimed warmup iterations run first.

    Returns:
        Mean elapsed time per iteration in microseconds.

    Raises:
        ValueError: If ``num_iters`` is smaller than 1.
    """
    if num_iters < 1:
        # Fail before any GPU work instead of dividing by zero at the end.
        raise ValueError(f"num_iters must be >= 1, got {num_iters}")
    init_dtype = torch.float16 if use_fp8 else dtype
    a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
    w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
    w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
    w1_scale = None
    w2_scale = None
    a1_scale = None
    a2_scale = None
    if use_fp8:
        if block_shape is not None:
            # One scale per (block_n x block_k) tile of each expert's weights.
            bsn, bsk = block_shape
            n_tiles_w1 = triton.cdiv(2 * N, bsn)
            k_tiles_w1 = triton.cdiv(K, bsk)
            n_tiles_w2 = triton.cdiv(K, bsn)
            k_tiles_w2 = triton.cdiv(N, bsk)
            w1_scale = torch.rand(
                E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
            )
            w2_scale = torch.rand(
                E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
            )
        else:
            # One scale per expert.
            w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
            w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
        a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
        a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
        # Only weights are stored in fp8; activations stay in bf16/fp16
        # and get dynamically quantized inside the kernel.
        w1 = w1.to(FP8_DTYPE)
        w2 = w2.to(FP8_DTYPE)
    quant_config = FusedMoEQuantConfig.make(
        quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
    )
    gating = torch.randn(M, E, device="cuda", dtype=torch.float32)

    def run_once():
        # One routing + fused-experts pass under the config being measured;
        # shared by warmup and timing so both execute exactly the same work.
        with override_config(config):
            topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )

    # Warmup: these iterations are excluded from the measurement.
    for _ in range(num_warmup):
        run_once()
    torch.cuda.synchronize()
    # Benchmark: bracket the timed iterations with CUDA events.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_iters):
        run_once()
    end.record()
    # Wait for the recorded events to complete before reading the timer.
    torch.cuda.synchronize()
    return start.elapsed_time(end) / num_iters * 1000  # ms -> us
 # Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
 # N = moe_intermediate_size // tp_size (the value used in config file lookup)
 # Each scenario is listed exactly once so no benchmark is run twice.
 MODELS = [
    # --- Few experts ---
    ("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
    ("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
    # --- Many experts: real model shapes at tp=1 ---
    # Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
    ("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
    # DeepSeek-V2: E=64, topk=6, N=1407, K=4096
    # (use 1408 to avoid odd alignment; real model is 1407)
    ("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
    # OLMoE-7B: E=64, topk=8, N=2048, K=2048
    ("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
    # GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
    ("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
    # Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
    ("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
    # DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
    ("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
    # Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048
    ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
    # E=128 N=1856 bf16
    ("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
    # E=256 N=512 bf16 (DS-V3 tp=4)
    ("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
    # E=512 N=256 bf16 (Qwen3-Next tp=2)
    ("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
    # --- FP8 block quant (many experts) ---
    # DS-V3 tp=4: E=256, N=512, fp8 block
    ("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
    # DS-V3 tp=8: E=256, N=256, fp8 block
    ("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
    # Qwen3-Next tp=2 fp8 block
    ("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
 ]
 BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
 def main():
    """Benchmark tuned, old-default and new-default configs for every scenario.

    For each entry of ``MODELS`` a table is printed with one row per batch
    size in ``BATCH_SIZES``: the time in microseconds for each config, plus
    the new/old and new/tuned time ratios. Rows where the new default is more
    than 5% slower than the old default are flagged with ``" <--"``.
    """
    set_random_seed(0)
    torch.set_default_device("cuda")
    act_dtype = torch.bfloat16
    for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
        print(f"\n{'=' * 90}")
        print(f"  {name}  (E={E}, N={N}, K={K}, topk={topk})")
        print(f"{'=' * 90}")
        # Try to load a tuned config table; a None result means "not available".
        if block_shape:
            block_n, block_k = block_shape[0], block_shape[1]
        else:
            block_n, block_k = None, None
        tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
        has_tuned = tuned is not None
        print(f"  Tuned config available: {has_tuned}")
        hdr = (
            f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
            f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
        )
        print(f"  {hdr}")
        print(f"  {'-' * len(hdr)}")
        # Keyword arguments shared by every timing call of this scenario.
        quant_kwargs = {"use_fp8": use_fp8, "block_shape": block_shape}
        for M in BATCH_SIZES:
            old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
            new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)
            # Positional arguments that follow the config in benchmark_config.
            problem = (M, E, N, K, topk, act_dtype)
            t_tuned = None
            if has_tuned:
                # Use the tuned entry whose batch-size key is closest to M.
                nearest = min(tuned, key=lambda batch: abs(batch - M))
                t_tuned = benchmark_config(tuned[nearest], *problem, **quant_kwargs)
            t_old = benchmark_config(old_cfg, *problem, **quant_kwargs)
            t_new = benchmark_config(new_cfg, *problem, **quant_kwargs)
            ratio_new_old = t_new / t_old
            if t_tuned:
                tuned_str = f"{t_tuned:11.2f}"
                ratio_tuned = f"{t_new / t_tuned:10.2f}x"
            else:
                tuned_str = f"{'N/A':>11}"
                ratio_tuned = f"{'N/A':>10}"
            # flag regressions where new default is >5% slower than old
            marker = ""
            if ratio_new_old > 1.05:
                marker = " <--"
            print(
                f"  {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
                f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
            )
 if __name__ == "__main__":
    main()
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -44,10 +44,8 @@ def benchmark_permute(
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
    # output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
        align_block_size = None
        qhidden_states = hidden_states
    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
@@ -67,7 +65,6 @@ def benchmark_permute(
            topk_ids=topk_ids,
            n_expert=num_experts,
            expert_map=None,
            align_block_size=align_block_size,
        )
    # JIT compilation & warmup
@@ -117,10 +114,8 @@ def benchmark_unpermute(
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_fp8_w8a8:
        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
        align_block_size = None
        qhidden_states = hidden_states
    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
@@ -142,7 +137,6 @@ def benchmark_unpermute(
            topk_ids=topk_ids,
            n_expert=num_experts,
            expert_map=None,
            align_block_size=align_block_size,
        )
        # convert to fp16/bf16 as gemm output
        return (
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -36,6 +36,7 @@ from typing import Any
 import numpy as np
 import torch
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.transformers_utils.config import get_config
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
    }
@default_vllm_config()
 def benchmark_mrope(
    model_name: str,
    num_tokens: int,
--- a/benchmarks/kernels/benchmark_mxfp4_qutlass.py
+++ b/benchmarks/kernels/benchmark_mxfp4_qutlass.py
--- a/benchmarks/kernels/benchmark_nvfp4_gemm.py
+++ b/benchmarks/kernels/benchmark_nvfp4_gemm.py
--- a/benchmarks/kernels/benchmark_nvfp4_quant.py
+++ b/benchmarks/kernels/benchmark_nvfp4_quant.py
--- a/benchmarks/kernels/benchmark_nvfp4_qutlass.py
+++ b/benchmarks/kernels/benchmark_nvfp4_qutlass.py
--- a/benchmarks/kernels/benchmark_per_token_quant_fp8.py
+++ b/benchmarks/kernels/benchmark_per_token_quant_fp8.py
@@ -7,6 +7,7 @@ from unittest.mock import patch
 import pandas as pd
 import torch
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
@@ -84,6 +85,7 @@ def calculate_diff(
 configs = []
@default_vllm_config()
 def benchmark_quantization(
    batch_size,
    hidden_size,
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -5,6 +5,7 @@ import itertools
 import torch
 from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
            args={},
        )
    )
    @default_vllm_config()
    def benchmark(batch_size, seq_len, num_heads, provider):
        dtype = torch.bfloat16
        max_position = 8192
--- a/benchmarks/run_structured_output_benchmark.sh
+++ b/benchmarks/run_structured_output_benchmark.sh
@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
      usage
      ;;
    *)
-      echo "Unknown argument: $1\n"
+      printf "Unknown argument: %s\n" "$1"
      usage
      ;;
  esac
@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR"
 QPS_VALUES=(25 20 15 10 5 1)
 # Common parameters
-COMMON_PARAMS="--backend $BACKEND \
+COMMON_PARAMS=(
-               --model $MODEL \
+  --backend "$BACKEND"
-               --dataset $DATASET \
+  --model "$MODEL"
-               --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
+  --dataset "$DATASET"
-               --save-results \
+  --structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
-               --result-dir $OUTPUT_DIR \
+  --save-results
-               --output-len $MAX_NEW_TOKENS \
+  --result-dir "$OUTPUT_DIR"
-               --port $PORT \
+  --output-len "$MAX_NEW_TOKENS"
-               --tokenizer-mode $TOKENIZER_MODE"
+  --port "$PORT"
  --tokenizer-mode "$TOKENIZER_MODE"
 )
 echo "Starting structured output benchmark with model: $MODEL"
 echo "Backend: $BACKEND"
@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do
  GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
  # Construct filename for this run
-  FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
+  FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
  NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
  NUM_PROMPTS=${NUM_PROMPTS%.*}  # Remove fractional part
  echo "Running benchmark with $NUM_PROMPTS prompts"
  # Run the benchmark
-  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
+  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
-    --request-rate $qps \
+    --request-rate "$qps" \
    --result-filename "$FILENAME" \
-    --num-prompts $NUM_PROMPTS
+    --num-prompts "$NUM_PROMPTS"
  echo "Completed benchmark with QPS: $qps"
  echo "----------------------------------------"
--- a/Show More
+++ b/Show More