Compare commits

..

7 Commits

Author SHA1 Message Date
Seiji Eicher
c44d0c6d66 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 5045d5c983)
2026-02-11 02:33:40 -08:00
Kunshang Ji
83db96d8cd [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit cb9574eb85)
2026-02-11 02:33:27 -08:00
zofia
dbfb79fe45 [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit b482f71e9f)
2026-02-11 02:33:15 -08:00
Roger Wang
b2e1fc3589 [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
(cherry picked from commit 8a5e0e2b2b)
2026-02-11 02:33:04 -08:00
Gregory Shtrasberg
55a1baebc5 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit c60f8e3b49)
2026-02-11 02:32:52 -08:00
Charlie Fu
e1e9841631 [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
(cherry picked from commit bb9f97308d)
2026-02-11 02:32:41 -08:00
zofia
5bd63387c3 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit 9bdb06b436)
2026-02-11 02:32:27 -08:00
1871 changed files with 44766 additions and 169887 deletions

View File

@@ -1,7 +1,6 @@
-group: Hardware - AMD Build
+group: Hardware
 steps:
 - label: "AMD: :docker: build image"
-key: image-build-amd
 depends_on: []
 device: amd_cpu
 no_plugin: true
@@ -10,7 +9,7 @@ steps:
 docker build
 --build-arg max_jobs=16
 --build-arg REMOTE_VLLM=1
---build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
+--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
 --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
 --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
 -f docker/Dockerfile.rocm

View File

@@ -21,20 +21,6 @@ steps:
 pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
 pytest -x -v -s tests/kernels/test_onednn.py"
-- label: CPU-Compatibility Tests
-depends_on: []
-soft_fail: true
-device: intel_cpu
-no_plugin: true
-source_file_dependencies:
-- cmake/cpu_extension.cmake
-- setup.py
-- vllm/platforms/cpu.py
-commands:
-- |
-bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
-bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
 - label: CPU-Language Generation and Pooling Model Tests
 depends_on: []
 soft_fail: true

View File

@@ -8,7 +8,7 @@ clean_docker_tag() {
 }
 print_usage_and_exit() {
-echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
+echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
 exit 1
 }
@@ -142,16 +142,11 @@ resolve_parent_commit() {
 print_bake_config() {
 echo "--- :page_facing_up: Resolved bake configuration"
-# Write to a temp directory to avoid polluting the repo root (which is the
-# Docker build context). Files left in the repo root get COPY'd into the
-# image and can cause duplicate artifact uploads from downstream steps.
-local bake_tmp
-bake_tmp="$(mktemp -d)"
-BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
 docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
 echo "Saved bake config to ${BAKE_CONFIG_FILE}"
 echo "--- :arrow_down: Uploading bake config to Buildkite"
-(cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
+buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
 }
 #################################
@@ -159,7 +154,7 @@ print_bake_config() {
 #################################
 print_instance_info
-if [[ $# -lt 5 ]]; then
+if [[ $# -lt 7 ]]; then
 print_usage_and_exit
 fi
@@ -168,8 +163,10 @@ REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-IMAGE_TAG=$5
-IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+IMAGE_TAG=$7
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
 # build config
 TARGET="test-ci"
@@ -196,6 +193,8 @@ export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
 export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
 # print args
 echo "--- :mag: Arguments"
@@ -203,6 +202,8 @@ echo "REGISTRY: ${REGISTRY}"
 echo "REPO: ${REPO}"
 echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
 echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
 echo "IMAGE_TAG: ${IMAGE_TAG}"
 echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

View File

@@ -5,7 +5,8 @@ steps:
 depends_on: []
 timeout_in_minutes: 600
 commands:
-- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
+- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
 retry:
 automatic:
 - exit_status: -1 # Agent was lost

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -24,11 +24,13 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---build-arg VLLM_CPU_X86=true \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--build-arg VLLM_CPU_AVX512BF16=true \
+--build-arg VLLM_CPU_AVX512VNNI=true \
+--build-arg VLLM_CPU_AMXBF16=true \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
 --target vllm-test \
 --progress plain .
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
 --target vllm-test \
 --progress plain .
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu

View File

@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
 # skip build if image already exists
-if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
 echo "Image not found, proceeding with build..."
 else
 echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
 --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
 --build-arg max_jobs=16 \
---build-arg buildkite_commit="$BUILDKITE_COMMIT" \
---tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
+--build-arg buildkite_commit=$BUILDKITE_COMMIT \
+--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
 --progress plain \
 https://github.com/vllm-project/vllm-gaudi.git
 # push
-docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu

View File

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
 --tasks chartqa \
 --batch_size auto \
 --apply_chat_template \
---limit "$LIMIT"
+--limit $LIMIT

View File

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``

View File

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install "lm-eval[api]>=0.4.11"
+# pip install "lm-eval[api]>=0.4.9.2"
 usage() {
 echo``
@@ -20,11 +20,14 @@ usage() {
 echo
 }
-while getopts "m:l:f:t:" OPT; do
+while getopts "m:b:l:f:t:" OPT; do
 case ${OPT} in
 m )
 MODEL="$OPTARG"
 ;;
+b )
+BATCH_SIZE="$OPTARG"
+;;
 l )
 LIMIT="$OPTARG"
 ;;

View File

@@ -13,10 +13,9 @@ import os
 from contextlib import contextmanager
 import lm_eval
+import numpy as np
 import yaml
-from vllm.platforms import current_platform
 DEFAULT_RTOL = 0.08
@@ -64,9 +63,6 @@ def launch_lm_eval(eval_config, tp_size):
 "allow_deprecated_quantization=True,"
 )
-if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
-model_args += "attention_backend=TRITON_ATTN"
 env_vars = eval_config.get("env_vars", None)
 with scoped_env_vars(env_vars):
 results = lm_eval.simple_evaluate(
@@ -106,8 +102,6 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
 f"ground_truth={ground_truth:.3f} | "
 f"measured={measured_value:.3f} | rtol={rtol}"
 )
-min_acceptable = ground_truth * (1 - rtol)
-success = success and measured_value >= min_acceptable
+success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
 assert success
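
The hunk above switches test_lm_eval_correctness_param between two acceptance criteria for the measured score: NumPy's symmetric np.isclose relative-tolerance check and a one-sided lower bound derived from ground_truth * (1 - rtol). A minimal sketch of the behavioral difference, assuming plain float scores (the constants are illustrative, not taken from the test configs):

import numpy as np

ground_truth, rtol = 0.78, 0.05

def two_sided(measured: float) -> bool:
    # symmetric relative tolerance around the reference value
    return bool(np.isclose(ground_truth, measured, rtol=rtol))

def lower_bound_only(measured: float) -> bool:
    # fail only when the score drops below ground_truth * (1 - rtol)
    return measured >= ground_truth * (1 - rtol)

for measured in (0.75, 0.70, 0.90):
    print(measured, two_sided(measured), lower_bound_only(measured))
# 0.75 passes both, 0.70 fails both, 0.90 fails np.isclose but passes the
# one-sided bound: the lower-bound form does not penalize scores that beat
# the reference.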

View File

@@ -83,6 +83,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
 "server_parameters": {
 "model": "meta-llama/Meta-Llama-3-8B",
 "tensor_parallel_size": 1,
+"swap_space": 16,
 "disable_log_stats": "",
 "load_format": "dummy"
 },

View File

@@ -7,10 +7,8 @@ import argparse
import html as _html import html as _html
import json import json
import os import os
from contextlib import nullcontext
from dataclasses import dataclass from dataclasses import dataclass
from importlib import util from importlib import util
from pathlib import Path
import pandas as pd import pandas as pd
@@ -33,45 +31,6 @@ pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: f"{x:.2f}") pd.set_option("display.float_format", lambda x: f"{x:.2f}")
# -----------------------------
# Concurrency normalization (NEW, small)
# -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if "concurr" in str(c).lower():
s = df[c]
if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1:
return c
raise ValueError(
"Cannot infer concurrency column. "
"Please rename the column to one of the known names "
"or add an explicit override (e.g., --concurrency-col)."
)
def _normalize_concurrency_in_df(
df: pd.DataFrame, canonical: str = "# of max concurrency."
) -> pd.DataFrame:
if canonical in df.columns:
return df
detected = _find_concurrency_col(df)
if detected in df.columns and detected != canonical:
return df.rename(columns={detected: canonical})
df[canonical] = pd.NA
return df
# ----------------------------- # -----------------------------
# Core data compare # Core data compare
# ----------------------------- # -----------------------------
@@ -91,25 +50,19 @@ def compare_data_columns(
- Concat along axis=1 (indexes align), then reset_index so callers can - Concat along axis=1 (indexes align), then reset_index so callers can
group by columns. group by columns.
- If --debug, add a <file_label>_name column per file. - If --debug, add a <file_label>_name column per file.
Minimal fix to support different max_concurrency lists across files:
- normalize concurrency column naming to "# of max concurrency."
- align on UNION of keys (missing points become NaN)
- BUGFIX: don't drop throughput rows based on P99/Median presence
""" """
print("\ncompare_data_column:", data_column) print("\ncompare_data_column:", data_column)
frames = [] frames = []
raw_data_cols: list[str] = [] raw_data_cols: list[str] = []
compare_frames = []
# Determine key cols after normalizing concurrency
cols_per_file: list[set] = [] cols_per_file: list[set] = []
for f in files: for f in files:
try: try:
df_tmp = pd.read_json(f, orient="records") df_tmp = pd.read_json(f, orient="records")
except Exception as err: except Exception as err:
raise ValueError(f"Failed to read {f}") from err raise ValueError(f"Failed to read {f}") from err
df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.")
cols_per_file.append(set(df_tmp.columns)) cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
@@ -120,25 +73,12 @@ def compare_data_columns(
"No common key columns found from info_cols across the input files." "No common key columns found from info_cols across the input files."
) )
union_index = None meta_added = False
metas: list[pd.DataFrame] = []
staged: list[tuple[str, pd.Series, pd.Series | None]] = []
for file in files: for file in files:
df = pd.read_json(file, orient="records") df = pd.read_json(file, orient="records")
df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.")
# BUGFIX: only drop rows for latency-like metrics; throughput rows may have if drop_column in df.columns:
# NaN in P99/Median columns even if the column exists in the JSON.
metric_lc = str(data_column).lower()
is_latency_metric = (
"ttft" in metric_lc
or "tpot" in metric_lc
or "p99" in metric_lc
or "median" in metric_lc
or metric_lc.strip() in {"p99", "median"}
)
if is_latency_metric and drop_column in df.columns:
df = df.dropna(subset=[drop_column], ignore_index=True) df = df.dropna(subset=[drop_column], ignore_index=True)
for c in ( for c in (
@@ -163,61 +103,35 @@ def compare_data_columns(
meta = meta.groupby(level=key_cols, dropna=False).first() meta = meta.groupby(level=key_cols, dropna=False).first()
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if data_column in df_idx.columns: if not s.index.is_unique:
s = df_idx[data_column] s = s.groupby(level=key_cols, dropna=False).mean()
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
else:
# keep NA series to preserve meta keys for union_index
s = pd.Series(pd.NA, index=meta.index)
s.name = file_label s.name = file_label
name_s = None if not meta_added:
frames.append(meta)
meta_added = True
if debug and name_column in df_idx.columns: if debug and name_column in df_idx.columns:
name_s = df_idx[name_column] name_s = df_idx[name_column]
if not name_s.index.is_unique: if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first() name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name" name_s.name = f"{file_label}_name"
frames.append(name_s)
if union_index is None: frames.append(s)
union_index = meta.index
else:
union_index = union_index.union(meta.index)
metas.append(meta)
staged.append((file_label, s, name_s))
if union_index is None:
raise ValueError("No data found after loading inputs.")
# meta first (union-aligned): build UNION meta across all files
if metas:
meta_union = pd.concat(metas, axis=0)
# Collapse duplicates on the MultiIndex; keep first non-null per column
meta_union = meta_union.groupby(level=key_cols, dropna=False).first()
frames.append(meta_union.reindex(union_index))
# values + ratios (union-aligned)
metric_series_aligned: list[pd.Series] = []
for file_label, s, name_s in staged:
s_aligned = s.reindex(union_index)
frames.append(s_aligned)
raw_data_cols.append(file_label) raw_data_cols.append(file_label)
metric_series_aligned.append(s_aligned) compare_frames.append(s)
if debug and name_s is not None: if len(compare_frames) >= 2:
frames.append(name_s.reindex(union_index)) base = compare_frames[0]
current = compare_frames[-1]
if len(metric_series_aligned) >= 2: if "P99" in data_column or "Median" in data_column:
base = metric_series_aligned[0]
current = metric_series_aligned[-1]
if "P99" in str(data_column) or "Median" in str(data_column):
ratio = base / current ratio = base / current
else: else:
ratio = current / base ratio = current / base
ratio = ratio.mask(base == 0) ratio = ratio.mask(base == 0)
ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}" ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio) frames.append(ratio)
concat_df = pd.concat(frames, axis=1).reset_index(drop=True) concat_df = pd.concat(frames, axis=1).reset_index(drop=True)
@@ -288,10 +202,24 @@ def split_json_by_tp_pp(
# ----------------------------- # -----------------------------
# Styling helpers # Styling helpers
# ----------------------------- # -----------------------------
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold( def _highlight_threshold(
df: pd.DataFrame, df: pd.DataFrame, threshold: float
threshold: float,
slack_pct: float = 0.0,
) -> pd.io.formats.style.Styler: ) -> pd.io.formats.style.Styler:
conc_col = _find_concurrency_col(df) conc_col = _find_concurrency_col(df)
key_cols = [ key_cols = [
@@ -304,24 +232,12 @@ def _highlight_threshold(
] ]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
try: return df.style.map(
slack_pct = float(slack_pct or 0.0) lambda v: "background-color:#e6ffe6;font-weight:bold;"
except Exception: if pd.notna(v) and v <= threshold
slack_pct = 0.0 else "",
slack_limit = threshold * (1.0 + slack_pct / 100.0) subset=conf_cols,
)
def _cell(v):
if pd.isna(v):
return ""
if v <= threshold:
# Strict SLA
return "background-color:#e6ffe6;font-weight:bold;"
if v <= slack_limit:
# Within slack range
return "background-color:#ffe5cc;font-weight:bold;"
return ""
return df.style.map(_cell, subset=conf_cols)
def highlight_ratio_columns(styler: pd.io.formats.style.Styler): def highlight_ratio_columns(styler: pd.io.formats.style.Styler):
@@ -359,177 +275,6 @@ def _apply_two_decimals(
return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="")
# -----------------------------
# Export helpers (Excel + CSV)
# -----------------------------
def _sanitize_sheet_name(name: str) -> str:
"""
Excel sheet constraints:
- max 31 chars
- cannot contain: : \ / ? * [ ]
- cannot be empty
NOTE: Use fast, non-regex operations here to avoid the third-party `regex`
module's compile overhead/edge-cases on some systems.
"""
name = "sheet" if name is None else str(name)
# Replace illegal characters with underscore.
trans = str.maketrans(
{
":": "_",
"\\": "_",
"/": "_",
"?": "_",
"*": "_",
"[": "_",
"]": "_",
}
)
name = name.translate(trans)
# Strip quotes/spaces and collapse whitespace.
name = name.strip().strip("'")
name = " ".join(name.split())
if not name:
name = "sheet"
return name[:31]
def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str:
d = dict(zip(group_cols, gkey_tuple))
# Always keep input/output lengths (these are important).
ilen = d.get("Input Len", "")
olen = d.get("Output Len", "")
lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else ""
# Shorten model name aggressively to make room for lens.
model = d.get("Model", "model")
leaf = str(model).split("/")[-1]
max_model_len = max(1, 31 - len(lens))
model_short = leaf[:max_model_len]
return _sanitize_sheet_name(f"{model_short}{lens}")
def _write_tables_to_excel_sheet(
writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]]
):
"""Write all blocks to a sheet with a single to_excel() call.
Pandas+openpyxl can be extremely slow when called many times per sheet.
We flatten blocks into one table with a 'Section' column to keep structure
while making Excel generation fast and deterministic.
"""
if not blocks:
pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False)
return
combined_parts: list[pd.DataFrame] = []
for title, df in blocks:
df2 = df.copy()
# Put the section label as the first column for readability.
df2.insert(0, "Section", title)
combined_parts.append(df2)
combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False)
combined.to_excel(writer, sheet_name=sheet, index=False)
def _safe_filename(s: str) -> str:
# Fast path without the third-party `regex` module.
s = " ".join(str(s).strip().split())
allowed = []
for ch in s:
if ch.isalnum() or ch in "._-":
allowed.append(ch)
else:
allowed.append("_")
out = "".join(allowed)
return out[:180] if len(out) > 180 else out
# -----------------------------
# vLLM environment export helper
# -----------------------------
def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame:
"""Parse vllm_env.txt into a flat table (Section, Key, Value).
Supports:
- section headers as standalone lines (no ':' or '=')
- key-value lines like 'OS: Ubuntu ...'
- env var lines like 'HF_HOME=/data/hf'
"""
lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines()
section = "General"
rows: list[dict] = []
def set_section(s: str):
nonlocal section
s = (s or "").strip()
if s:
section = s
for raw in lines:
stripped = raw.strip()
if not stripped:
continue
# divider lines like =====
if set(stripped) <= {"="}:
continue
# section header heuristic: short standalone line
if ":" not in stripped and "=" not in stripped and len(stripped) <= 64:
if stripped.lower().startswith("collecting environment information"):
continue
set_section(stripped)
continue
# env var style: KEY=VALUE (and not a URL with :)
if "=" in stripped and ":" not in stripped:
k, v = stripped.split("=", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
# key: value
if ":" in stripped:
k, v = stripped.split(":", 1)
k = k.strip()
v = v.strip()
if k:
rows.append({"Section": section, "Key": k, "Value": v})
continue
return pd.DataFrame(rows, columns=["Section", "Key", "Value"])
def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None:
"""Load vllm_env.txt next to the *original* input JSON file.
Note: when only one -f is provided, the script may split JSON into ./splits/...,
but vllm_env.txt typically lives next to the original benchmark_results.json.
"""
base_dir: Path | None = None
if getattr(args, "file", None):
base_dir = Path(args.file[0]).resolve().parent
elif files:
base_dir = Path(files[0]).resolve().parent
if base_dir is None:
return None
env_path = base_dir / "vllm_env.txt"
if not env_path.exists():
return None
df = _parse_vllm_env_txt(env_path)
return df
# ----------------------------- # -----------------------------
# Valid max concurrency summary helpers # Valid max concurrency summary helpers
# ----------------------------- # -----------------------------
@@ -556,11 +301,7 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]:
def _max_concurrency_ok( def _max_concurrency_ok(
df: pd.DataFrame, df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float
conc_col: str,
cfg_col: str,
threshold: float,
slack_pct: float = 0.0,
): ):
if df is None or conc_col not in df.columns or cfg_col not in df.columns: if df is None or conc_col not in df.columns or cfg_col not in df.columns:
return pd.NA return pd.NA
@@ -573,14 +314,7 @@ def _max_concurrency_ok(
if d.empty: if d.empty:
return pd.NA return pd.NA
# Accept values up to (1 + slack_pct%) above the SLA. ok = d[d[cfg_col] <= threshold]
try:
slack_pct = float(slack_pct or 0.0)
except Exception:
slack_pct = 0.0
effective_limit = float(threshold) * (1.0 + slack_pct / 100.0)
ok = d[d[cfg_col] <= effective_limit]
if ok.empty: if ok.empty:
return pd.NA return pd.NA
@@ -646,25 +380,15 @@ def build_valid_max_concurrency_summary_html(
if not cfg_cols: if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
# Display SLA ranges in the table header (SLA .. SLA*(1+slack))
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = [] rows = []
for cfg in cfg_cols: for cfg in cfg_cols:
ttft_max = ( ttft_max = (
_max_concurrency_ok( _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms)
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None if ttft_group_df is not None
else pd.NA else pd.NA
) )
tpot_max = ( tpot_max = (
_max_concurrency_ok( _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms)
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None if tpot_group_df is not None
else pd.NA else pd.NA
) )
@@ -693,8 +417,8 @@ def build_valid_max_concurrency_summary_html(
rows.append( rows.append(
{ {
"Configuration": cfg, "Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max, f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max, f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max,
f"Max {conc_col} (Both)": both, f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both, "Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both, "TTFT @ Both (ms)": ttft_at_both,
@@ -704,6 +428,7 @@ def build_valid_max_concurrency_summary_html(
summary_df = pd.DataFrame(rows) summary_df = pd.DataFrame(rows)
# --- Coerce numeric columns so Styler doesn't miss them due to object dtype ---
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
@@ -711,10 +436,12 @@ def build_valid_max_concurrency_summary_html(
both_col = f"Max {conc_col} (Both)" both_col = f"Max {conc_col} (Both)"
# --- Strict 2-decimal formatting for ALL non-Configuration columns ---
formatters = {} formatters = {}
for c in summary_df.columns: for c in summary_df.columns:
if c == "Configuration": if c == "Configuration":
continue continue
# default argument binds per-column formatter correctly
formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}"
styler = summary_df.style.format(formatters) styler = summary_df.style.format(formatters)
@@ -733,104 +460,6 @@ def build_valid_max_concurrency_summary_html(
return title + styler.to_html(table_attributes='border="1" class="dataframe"') return title + styler.to_html(table_attributes='border="1" class="dataframe"')
def build_valid_max_concurrency_summary_df(
tput_group_df: pd.DataFrame | None,
ttft_group_df: pd.DataFrame | None,
tpot_group_df: pd.DataFrame | None,
conc_col: str,
args,
) -> pd.DataFrame | None:
if ttft_group_df is None and tpot_group_df is None:
return None
ttft_cols = (
_config_value_columns(ttft_group_df, conc_col)
if ttft_group_df is not None
else []
)
tpot_cols = (
_config_value_columns(tpot_group_df, conc_col)
if tpot_group_df is not None
else []
)
tput_cols = (
_config_value_columns(tput_group_df, conc_col)
if tput_group_df is not None
else []
)
if ttft_group_df is not None and tpot_group_df is not None:
cfg_cols = [c for c in ttft_cols if c in tpot_cols]
if tput_group_df is not None:
cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols
else:
cfg_cols = ttft_cols or tpot_cols
if not cfg_cols:
cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str)
ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0)
tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0)
ttft_range = f"{args.ttft_max_ms:g}{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)"
tpot_range = f"{args.tpot_max_ms:g}{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)"
rows = []
for cfg in cfg_cols:
ttft_max = (
_max_concurrency_ok(
ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct
)
if ttft_group_df is not None
else pd.NA
)
tpot_max = (
_max_concurrency_ok(
tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct
)
if tpot_group_df is not None
else pd.NA
)
both = (
pd.NA
if (pd.isna(ttft_max) or pd.isna(tpot_max))
else min(ttft_max, tpot_max)
)
tput_at_both = (
_value_at_concurrency(tput_group_df, conc_col, cfg, both)
if tput_group_df is not None
else pd.NA
)
ttft_at_both = (
_value_at_concurrency(ttft_group_df, conc_col, cfg, both)
if ttft_group_df is not None
else pd.NA
)
tpot_at_both = (
_value_at_concurrency(tpot_group_df, conc_col, cfg, both)
if tpot_group_df is not None
else pd.NA
)
rows.append(
{
"Configuration": cfg,
f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max,
f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max,
f"Max {conc_col} (Both)": both,
"Output Tput @ Both (tok/s)": tput_at_both,
"TTFT @ Both (ms)": ttft_at_both,
"TPOT @ Both (ms)": tpot_at_both,
}
)
df = pd.DataFrame(rows)
for c in df.columns:
if c != "Configuration":
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
# ----------------------------- # -----------------------------
# Plot helper # Plot helper
# ----------------------------- # -----------------------------
@@ -908,35 +537,6 @@ def build_parser() -> argparse.ArgumentParser:
default=100.0, default=100.0,
help="Reference limit for TPOT plots (ms)", help="Reference limit for TPOT plots (ms)",
) )
# ---- SLA tolerance (slack) options ----
parser.add_argument(
"--ttft-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TTFT SLA (default: 5).",
)
parser.add_argument(
"--tpot-slack-pct",
type=float,
default=5.0,
help="Allowed percentage above TPOT SLA (default: 5).",
)
# ---- export options ----
parser.add_argument(
"--excel-out",
type=str,
default="perf_comparison.xlsx",
help="Write one sheet per (Model, Dataset, Input Len, Output Len).",
)
parser.add_argument(
"--csv-out-dir",
type=str,
default="",
help="If set, write per-group per-metric CSVs into this directory.",
)
return parser return parser
@@ -1015,13 +615,9 @@ def render_metric_table_html(
metric_name = metric_label.lower() metric_name = metric_label.lower()
if "ttft" in metric_name: if "ttft" in metric_name:
styler = _highlight_threshold( styler = _highlight_threshold(display_group, args.ttft_max_ms)
display_group, args.ttft_max_ms, args.ttft_slack_pct
)
elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name):
styler = _highlight_threshold( styler = _highlight_threshold(display_group, args.tpot_max_ms)
display_group, args.tpot_max_ms, args.tpot_slack_pct
)
else: else:
styler = display_group.style styler = display_group.style
@@ -1061,6 +657,7 @@ def maybe_write_plot(
markers=True, markers=True,
) )
# Ensure plot hover + y tick labels are also 2 decimals.
fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>") fig.update_traces(hovertemplate="%{y:.2f}<extra></extra>")
fig.update_yaxes(tickformat=".2f") fig.update_yaxes(tickformat=".2f")
@@ -1133,186 +730,87 @@ def write_report_group_first(
for metric_label, (df, _) in metric_cache.items() for metric_label, (df, _) in metric_cache.items()
} }
csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None with open("perf_comparison.html", "w", encoding="utf-8") as main_fh:
if csv_dir: main_fh.write('<meta charset="utf-8">\n')
csv_dir.mkdir(parents=True, exist_ok=True) for gkey in group_keys:
gkey_tuple = normalize_group_key(gkey)
suffix = build_group_suffix(group_cols_canonical, gkey_tuple)
sub_path = group_filename(gkey_tuple)
group_header = (
'<div style="font-size: 1.4em; font-weight: 700; '
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}"
"</div>\n"
)
excel_path = args.excel_out or "perf_comparison.xlsx" main_fh.write(group_header)
disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1" with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
# Prefer xlsxwriter for speed; fallback to openpyxl if unavailable. for metric_label in plan.data_cols:
excel_engine = ( gb = metric_groupbys[metric_label]
os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter" df_sorted, raw_data_cols = metric_cache[metric_label]
)
if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None:
excel_engine = "openpyxl"
excel_engine_kwargs = {} try:
if excel_engine == "xlsxwriter": group_df = gb.get_group(gkey)
# Reduce memory pressure & usually faster writes. except KeyError:
excel_engine_kwargs = {"options": {"constant_memory": True}} missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
xw_ctx = ( main_fh.write(missing)
nullcontext(None) sub_fh.write(missing)
if disable_excel continue
else pd.ExcelWriter(
excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs if conc_col not in group_df.columns:
) conc_col = _find_concurrency_col(group_df)
)
with xw_ctx as xw: mn = metric_label.lower().strip()
used_sheets: set[str] = set() if "tok/s" in mn:
# ---- Environment sheet (first) ---- tput_group_df = group_df
env_sheet = _sanitize_sheet_name("Environment") elif "ttft" in mn:
env_df = _load_env_df_for_inputs(args, files) ttft_group_df = group_df
if xw is not None: elif mn in ("p99", "median") or "tpot" in mn:
if env_df is None or env_df.empty: tpot_group_df = group_df
pd.DataFrame(
[ display_group = group_df.drop(
{ columns=group_cols_canonical, errors="ignore"
"Section": "Environment", )
"Key": "vllm_env.txt",
"Value": "NOT FOUND (or empty)", html = render_metric_table_html(
} display_group, metric_label, suffix, args
] )
).to_excel(xw, sheet_name=env_sheet, index=False) main_fh.write(html)
else: sub_fh.write(html)
env_df.to_excel(xw, sheet_name=env_sheet, index=False)
used_sheets.add(env_sheet) maybe_write_plot(
with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: main_fh,
main_fh.write('<meta charset="utf-8">\n') sub_fh,
for gkey in group_keys: group_df=group_df,
gkey_tuple = normalize_group_key(gkey) raw_data_cols=raw_data_cols,
suffix = build_group_suffix(group_cols_canonical, gkey_tuple) metric_label=metric_label,
sub_path = group_filename(gkey_tuple) y_axis_col=y_axis_col,
group_header = ( args=args,
'<div style="font-size: 1.4em; font-weight: 700; ' )
'margin: 18px 0 10px 0;">'
f"{_html.escape(suffix)}" summary_html = build_valid_max_concurrency_summary_html(
"</div>\n" tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
) )
if summary_html:
main_fh.write(group_header) main_fh.write(summary_html)
sub_fh.write(summary_html)
do_excel = xw is not None
sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple)
sheet_base = sheet
if do_excel:
dedup_i = 1
while sheet in used_sheets:
dedup_i += 1
suffix = f"_{dedup_i}"
# Ensure uniqueness even when sheet names are truncated.
base = str(sheet_base)
keep = max(1, 31 - len(suffix))
sheet = _sanitize_sheet_name(base[:keep] + suffix)
used_sheets.add(sheet)
excel_blocks: list[tuple[str, pd.DataFrame]] = []
with open(sub_path, "w", encoding="utf-8") as sub_fh:
sub_fh.write('<meta charset="utf-8">\n')
sub_fh.write(group_header)
tput_group_df = None
ttft_group_df = None
tpot_group_df = None
conc_col = args.xaxis
for metric_label in plan.data_cols:
gb = metric_groupbys[metric_label]
df_sorted, raw_data_cols = metric_cache[metric_label]
try:
group_df = gb.get_group(gkey)
except KeyError:
missing = (
'<div style="font-size: 1.1em; font-weight: 600; '
'margin: 10px 0;">'
f"{_html.escape(metric_label)} — missing for this group"
"</div>\n"
)
main_fh.write(missing)
sub_fh.write(missing)
continue
if conc_col not in group_df.columns:
conc_col = _find_concurrency_col(group_df)
mn = metric_label.lower().strip()
if "tok/s" in mn:
tput_group_df = group_df
elif "ttft" in mn:
ttft_group_df = group_df
elif mn in ("p99", "median") or "tpot" in mn:
tpot_group_df = group_df
display_group = group_df.drop(
columns=group_cols_canonical, errors="ignore"
)
html = render_metric_table_html(
display_group, metric_label, suffix, args
)
main_fh.write(html)
sub_fh.write(html)
maybe_write_plot(
main_fh,
sub_fh,
group_df=group_df,
raw_data_cols=raw_data_cols,
metric_label=metric_label,
y_axis_col=y_axis_col,
args=args,
)
excel_blocks.append(
(metric_label, group_df.reset_index(drop=True))
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__{metric_label}".replace(" ", "_").replace(
"/", "_"
)
)
group_df.to_csv(csv_dir / f"{fn}.csv", index=False)
summary_html = build_valid_max_concurrency_summary_html(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_html:
main_fh.write(summary_html)
sub_fh.write(summary_html)
summary_df = build_valid_max_concurrency_summary_df(
tput_group_df=tput_group_df,
ttft_group_df=ttft_group_df,
tpot_group_df=tpot_group_df,
conc_col=conc_col,
args=args,
)
if summary_df is not None:
excel_blocks.append(
("Valid Max Concurrency Summary", summary_df)
)
if csv_dir:
fn = _safe_filename(
f"{sheet}__Valid_Max_Concurrency_Summary"
)
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False)
if do_excel:
_write_tables_to_excel_sheet(xw, sheet, excel_blocks)
if disable_excel:
print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).")
else:
print(f"Wrote Excel: {excel_path}")
if csv_dir:
print(f"Wrote CSVs under: {csv_dir}")
def main(): def main():

View File

@@ -1,4 +1,6 @@
#!/bin/bash #!/bin/bash
# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory # This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/ # Benchmarking results will be available inside vllm/benchmarks/results/
@@ -7,26 +9,14 @@
set -x set -x
set -o pipefail set -o pipefail
# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"
# Adaptive search controls
ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
check_gpus() { check_gpus() {
if command -v nvidia-smi; then if command -v nvidia-smi; then
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true) declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true) declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true) declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
@@ -54,7 +44,7 @@ check_cpus() {
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then if [[ $numa_count -gt 0 ]]; then
echo "NUMA found." echo "NUMA found."
echo "$numa_count" echo $numa_count
else else
echo "Need at least 1 NUMA to run benchmarking." echo "Need at least 1 NUMA to run benchmarking."
exit 1 exit 1
@@ -122,12 +112,13 @@ json2envs() {
} }
wait_for_server() { wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
local timeout_val="1200" local timeout_val="1200"
timeout "$timeout_val" bash -c ' timeout "$timeout_val" bash -c '
until curl -sf http://localhost:8000/v1/models >/dev/null; do until curl -X POST localhost:8000/v1/completions; do
sleep 1 sleep 1
done done' && return 0 || return 1
'
} }
kill_processes_launched_by_current_bash() { kill_processes_launched_by_current_bash() {
@@ -190,304 +181,6 @@ upload_to_buildkite() {
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
# -------------------------------
# Adaptive concurrency helpers
# -------------------------------
result_json_path_for_serving() {
local test_name=$1
local qps=$2
local max_concurrency=$3
echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
}
extract_metric_ms() {
local metric_name=$1
local json_file=$2
[[ -f "$json_file" ]] || return 0
if [[ "$metric_name" == "ttft" ]]; then
jq -r '
[
.ttft_ms.p99?,
.metrics.ttft_ms.p99?,
.ttft.p99?,
.metrics.ttft.p99?,
.p99_ttft_ms?,
.ttft_ms.mean?,
.metrics.ttft_ms.mean?,
.ttft.mean?,
.metrics.ttft.mean?,
.mean_ttft_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
else
jq -r '
[
.tpot_ms.p99?,
.metrics.tpot_ms.p99?,
.tpot.p99?,
.metrics.tpot.p99?,
.p99_tpot_ms?,
.itl_ms.p99?,
.metrics.itl_ms.p99?,
.inter_token_latency_ms.p99?,
.tpot_ms.mean?,
.metrics.tpot_ms.mean?,
.tpot.mean?,
.metrics.tpot.mean?,
.itl_ms.mean?,
.metrics.itl_ms.mean?,
.mean_tpot_ms?,
.mean_itl_ms?
] | map(select(. != null)) | .[0] // empty
' "$json_file"
fi
}
evaluate_sla_from_json() {
local json_file=$1
local ttft
local tpot
local pass
[[ -f "$json_file" ]] || return 2
ttft=$(extract_metric_ms ttft "$json_file")
tpot=$(extract_metric_ms tpot "$json_file")
[[ -n "$ttft" && -n "$tpot" ]] || return 2
pass=$(jq -n \
--argjson ttft "$ttft" \
--argjson tpot "$tpot" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
'($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
[[ "$pass" == "true" ]]
}
write_adaptive_summary_json() {
local summary_file=$1
local test_name=$2
local qps=$3
local static_last_pass=$4
local static_first_fail=$5
local final_last_pass=$6
local final_first_fail=$7
jq -n \
--arg test_name "$test_name" \
--arg qps "$qps" \
--argjson sla_ttft "$SLA_TTFT_MS" \
--argjson sla_tpot "$SLA_TPOT_MS" \
--arg static_last_pass "${static_last_pass:-}" \
--arg static_first_fail "${static_first_fail:-}" \
--arg final_last_pass "${final_last_pass:-}" \
--arg final_first_fail "${final_first_fail:-}" \
'{
test_name: $test_name,
qps: $qps,
sla_ttft_ms: $sla_ttft,
sla_tpot_ms: $sla_tpot,
static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
}' > "$summary_file"
}
run_single_serving_probe() {
local test_name=$1
local qps=$2
local max_concurrency=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
local result_json
local num_prompts_arg=""
local client_command
result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
if [[ -f "$result_json" ]]; then
evaluate_sla_from_json "$result_json"
return $?
fi
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
$client_args_effective $client_remote_args "
echo "Adaptive probe: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then
bash -c "$client_command"
fi
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
adaptive_search: true
}')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
evaluate_sla_from_json "$result_json"
}
adaptive_refine_from_static_results() {
local test_name=$1
local qps=$2
local max_concurrency_list_raw=$3
local tp=$4
local compilation_config_mode=$5
local optimization_level=$6
local client_args_effective=$7
local client_remote_args=$8
local server_command=$9
local sorted_points
local point
local rc
local static_last_pass=""
local static_first_fail=""
local largest_static=""
local step_hint=1
local previous_point=""
local low
local high
local mid
local probes=0
local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
[[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
[[ "${DRY_RUN:-0}" != "1" ]] || return 0
sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
[[ -n "$sorted_points" ]] || return 0
while read -r point; do
[[ -z "$point" ]] && continue
largest_static="$point"
evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
rc=$?
if (( rc == 0 )); then
static_last_pass="$point"
elif (( rc == 1 )); then
if [[ -n "$static_last_pass" ]]; then
static_first_fail="$point"
break
fi
fi
if [[ -n "$previous_point" ]]; then
step_hint=$(( point - previous_point ))
if (( step_hint < 1 )); then step_hint=1; fi
fi
previous_point="$point"
done <<< "$sorted_points"
if [[ -z "$static_last_pass" ]]; then
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
return 0
fi
if [[ -n "$static_first_fail" ]]; then
low=$static_last_pass
high=$static_first_fail
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
return 0
fi
low=$largest_static
high=""
while (( probes < ADAPTIVE_MAX_PROBES )); do
point=$(( low + step_hint ))
if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
point=$ADAPTIVE_MAX_CONCURRENCY
fi
(( point > low )) || break
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$point" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$point
(( point == ADAPTIVE_MAX_CONCURRENCY )) && break
step_hint=$(( step_hint * 2 ))
if (( step_hint < 1 )); then step_hint=1; fi
elif (( rc == 1 )); then
high=$point
break
else
break
fi
done
if [[ -n "$high" ]]; then
while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
mid=$(( (low + high) / 2 ))
probes=$(( probes + 1 ))
run_single_serving_probe \
"$test_name" "$qps" "$mid" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
rc=$?
if (( rc == 0 )); then
low=$mid
elif (( rc == 1 )); then
high=$mid
else
break
fi
done
fi
write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
}
run_benchmark_tests() { run_benchmark_tests() {
# run benchmark tests using `vllm bench <test_type>` command # run benchmark tests using `vllm bench <test_type>` command
# $1: test type (latency or throughput) # $1: test type (latency or throughput)
@@ -559,16 +252,37 @@ run_benchmark_tests() {
done done
} }
run_latency_tests() { run_benchmark_tests "latency" "$1"; } run_latency_tests() {
run_startup_tests() { run_benchmark_tests "startup" "$1"; } run_benchmark_tests "latency" "$1"
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } }
merge_serving_tests_stream() { run_startup_tests() {
# Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. run_benchmark_tests "startup" "$1"
# This helper does NOT modify JSON; it only filters the stream in dry-run mode. }
local serving_test_file="$1"
# shellcheck disable=SC2016 run_throughput_tests() {
local merged=' run_benchmark_tests "throughput" "$1"
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '
if type == "array" then if type == "array" then
# Plain format: test cases array # Plain format: test cases array
.[] .[]
@@ -590,50 +304,7 @@ merge_serving_tests_stream() {
else else
error("Unsupported serving test file format: must be array or object with .tests") error("Unsupported serving test file format: must be array or object with .tests")
end end
' ' "$serving_test_file" | while read -r params; do
jq -c "$merged" "$serving_test_file" | \
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
select((($model|length)==0)
or ((.server_parameters.model // "") == $model)
or ((.client_parameters.model // "") == $model))
| select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
'
else
cat
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
#
# Supported JSON formats:
# 1) Plain format: top-level array
# [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
#
# 2) Default parameters field + plain format tests
# {
# "defaults": { ... },
# "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
# }
local serving_test_file
serving_test_file=$1
# In dry-run mode, if filters are provided but no tests match, fail fast.
if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
local count
count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
if [[ "$count" -eq 0 ]]; then
echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
return 0
fi
fi
# Iterate over serving tests (merged + optional filtered stream)
merge_serving_tests_stream "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it. # get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name') test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then if [[ ! "$test_name" =~ ^serving_ ]]; then
@@ -652,48 +323,10 @@ run_serving_tests() {
server_envs=$(echo "$params" | jq -r '.server_environment_variables') server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
# vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly. server_args=$(json2args "$server_params")
server_model=$(echo "$server_params" | jq -r '.model // empty')
if [[ -z "$server_model" || "$server_model" == "null" ]]; then
echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
exit 1
fi
server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
server_args=$(json2args "$server_params_no_model")
server_envs=$(json2envs "$server_envs") server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
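# Illustration (hedged; json2args/json2envs are defined earlier in this script and
# are assumed to emit one flag per JSON key): given
#   server_parameters = {"model": "Qwen/Qwen3-8B", "tensor_parallel_size": 1, "dtype": "bfloat16"}
# the model key is stripped above and passed positionally to `vllm serve`, leaving
# server_args as roughly "--tensor-parallel-size 1 --dtype bfloat16".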
# ------------------------------------------------------------
# Option 1: Dynamic num-prompts scaling based on max_concurrency
#
# If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
# num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
#
# If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
# unchanged (i.e., whatever is in serving-tests-*.json).
# ------------------------------------------------------------
PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose
MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
# Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
# Handles: --num-prompts 123 and --num-prompts=123
client_args_no_np="$(
printf ' %s ' "$client_args" \
| sed -E \
-e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
-e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
)"
# normalize whitespace
client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
client_args_no_np="$(echo "$client_args_no_np" | xargs)"
client_args_effective="$client_args_no_np"
else
client_args_effective="$client_args"
fi
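# Illustration: with PROMPTS_PER_CONCURRENCY set, a JSON-supplied value such as
# "--num-prompts 200" or "--num-prompts=200" is stripped from the client args here
# and re-added per concurrency level further below, so the two never conflict.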
# qps_list # qps_list
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -725,13 +358,14 @@ run_serving_tests() {
fi fi
# check if server model and client model is aligned # check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name." echo "Server model and client model must be the same. Skip testcase $test_name."
continue continue
fi fi
server_command="$server_envs vllm serve $server_model \ server_command="$server_envs vllm serve \
$server_args" $server_args"
# run the server # run the server
@@ -739,7 +373,7 @@ run_serving_tests() {
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server # support remote vllm server
client_remote_args="" client_remote_args=""
if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" & bash -c "$server_command" &
server_pid=$! server_pid=$!
# wait until the server is alive # wait until the server is alive
@@ -750,9 +384,6 @@ run_serving_tests() {
echo "" echo ""
echo "vLLM failed to start within the timeout period." echo "vLLM failed to start within the timeout period."
fi fi
elif [[ "${DRY_RUN:-0}" == "1" ]]; then
# dry-run: don't start server
echo "Dry Run."
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
if [[ ${REMOTE_PORT} ]]; then if [[ ${REMOTE_PORT} ]]; then
@@ -771,21 +402,15 @@ run_serving_tests() {
for qps in $qps_list; do for qps in $qps_list; do
# remove the surrounding single quote from qps # remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf" qps="inf"
echo "now qps is $qps"
fi fi
# iterate over different max_concurrency # iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do for max_concurrency in $max_concurrency_list; do
new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name" echo " new test name $new_test_name"
# If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
num_prompts_arg=""
if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
num_prompts_arg="--num-prompts $num_prompts"
fi
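# Worked example: PROMPTS_PER_CONCURRENCY=10 and max_concurrency=64 yields
# "--num-prompts 640"; with the defaults MIN_NUM_PROMPTS=1 and
# MAX_NUM_PROMPTS=1000000 no clamping applies, so larger concurrencies simply
# receive proportionally more prompts.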
# pass the tensor parallel size, the compilation mode, and the optimization # pass the tensor parallel size, the compilation mode, and the optimization
# level to the client so that they can be used on the benchmark dashboard # level to the client so that they can be used on the benchmark dashboard
client_command="vllm bench serve \ client_command="vllm bench serve \
@@ -794,16 +419,13 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--max-concurrency $max_concurrency \ --max-concurrency $max_concurrency \
$num_prompts_arg \
--metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \ --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
$client_args_effective $client_remote_args " $client_args $client_remote_args "
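# Illustration (hedged): for the serving_llama8B_tp1_sharegpt case defined in the
# serving-tests JSON later in this diff, one iteration resolves to roughly
#   vllm bench serve --result-filename serving_llama8B_tp1_sharegpt_qps_inf_concurrency_32.json \
#     --request-rate inf --max-concurrency 32 --model meta-llama/Llama-3.1-8B-Instruct \
#     --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200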
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
if [[ "${DRY_RUN:-0}" != "1" ]]; then bash -c "$client_command"
bash -c "$client_command"
fi
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
@@ -818,23 +440,15 @@ run_serving_tests() {
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
adaptive_refine_from_static_results \
"$test_name" "$qps" "$max_concurrency_list" "$tp" \
"$compilation_config_mode" "$optimization_level" \
"$client_args_effective" "$client_remote_args" "$server_command"
done done
# clean up # clean up
if [[ "${DRY_RUN:-0}" != "1" ]]; then kill -9 $server_pid
kill -9 "$server_pid" kill_gpu_processes
kill_gpu_processes
fi
done done
} }
main() { main() {
local ARCH local ARCH
ARCH='' ARCH=''
if [[ "$ON_CPU" == "1" ]]; then if [[ "$ON_CPU" == "1" ]]; then
@@ -844,13 +458,7 @@ main() {
check_gpus check_gpus
ARCH="$arch_suffix" ARCH="$arch_suffix"
fi fi
check_hf_token
# DRY_RUN does not execute vLLM; do not require HF_TOKEN.
if [[ "${DRY_RUN:-0}" != "1" ]]; then
check_hf_token
else
echo "DRY_RUN=1 -> skip HF_TOKEN validation"
fi
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@@ -871,16 +479,11 @@ main() {
# dump vllm info via vllm collect-env # dump vllm info via vllm collect-env
env_output=$(vllm collect-env) env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
if [[ "${DRY_RUN:-0}" == "1" ]]; then
echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
exit 0
fi
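# e.g. DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct bash <this script>
# only walks the serving test definitions (SERVING_JSON, defaulting to
# serving-tests$ARCH.json), records the would-be commands, and exits here before
# the latency/startup/throughput suites; no vLLM server is launched.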
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
@@ -888,7 +491,6 @@ main() {
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
upload_to_buildkite upload_to_buildkite
} }

View File

@@ -51,56 +51,5 @@
"max-model-len": 256, "max-model-len": 256,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "latency_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"max-model-len": 2048,
"dtype": "bfloat16"
}
},
{
"test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"max-model-len": 512,
"max-num-seqs": 128,
"async-scheduling": "",
"gpu-memory-utilization": 0.95,
"enable_expert_parallel": ""
}
},
{
"test_name": "latency_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"max-model-len": 2048,
"max-num-seqs": 128,
"dtype": "bfloat16",
"async-scheduling": ""
}
} }
] ]

View File

@@ -1,37 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
},
"server_parameters": {
"dtype": "bfloat16",
"model": "openai/whisper-large-v3-turbo"
},
"client_parameters": {
"model": "openai/whisper-large-v3-turbo",
"backend": "openai-audio",
"endpoint": "/v1/audio/transcriptions",
"dataset_name": "hf",
"dataset_path": "openslr/librispeech_asr",
"hf_subset": "clean",
"hf_split": "test",
"no_stream": "",
"no_oversample": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,41 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
32,
64,
128
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"dtype": "bfloat16",
"model": "jinaai/jina-embeddings-v3",
"trust_remote_code": ""
},
"client_parameters": {
"model": "jinaai/jina-embeddings-v3",
"backend": "openai-embeddings",
"endpoint": "/v1/embeddings",
"dataset_name": "sharegpt",
"dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_jina_embed_v3_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {}
}
]
}

View File

@@ -1,355 +0,0 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp4_random_2048_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
}
]
}

View File

@@ -72,6 +72,17 @@
"random-output-len": 128 "random-output-len": 128
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_128",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{ {
"test_name": "serving_llama8B_tp1_random_128_2048", "test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": { "server_parameters": {
@@ -94,6 +105,17 @@
"random-output-len": 2048 "random-output-len": 2048
} }
}, },
{
"test_name": "serving_llama8B_tp4_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 4
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{ {
"test_name": "serving_llama8B_tp1_random_2048_128", "test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": { "server_parameters": {
@@ -117,25 +139,144 @@
} }
}, },
{ {
"test_name": "serving_llama8B_tp1_random_2048_2048", "test_name": "serving_llama8B_tp4_random_2048_128",
"server_parameters": { "server_parameters": {
"tensor_parallel_size": 1 "tensor_parallel_size": 4
}, },
"client_parameters": { "client_parameters": {
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 2048,
"random-output-len": 2048 "random-output-len": 128
} }
}, },
{ {
"test_name": "serving_llama8B_tp2_random_2048_2048", "test_name": "serving_llama8B_int4_tp1_random_128_128",
"server_parameters": { "server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 2 "tensor_parallel_size": 2
}, },
"client_parameters": { "client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random", "dataset_name": "random",
"random-input-len": 2048, "random-input-len": 128,
"random-output-len": 2048 "random-output-len": 128
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"tensor_parallel_size": 4
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama3B_tp1_random_128_128",
"server_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_granite2B_tp1_random_128_128",
"server_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "ibm-granite/granite-3.2-2b-instruct",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen1.7B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-1.7B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-1.7B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen4B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-4B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-4B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_qwen8B_tp1_random_128_128",
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_glm9B_tp1_random_128_128",
"server_parameters": {
"model": "zai-org/glm-4-9b-hf",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "zai-org/glm-4-9b-hf",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_gemma7B_tp1_random_128_128",
"server_parameters": {
"model": "google/gemma-7b",
"tensor_parallel_size": 1
},
"client_parameters": {
"model": "google/gemma-7b",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
} }
} }
] ]

View File

@@ -10,6 +10,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -36,6 +37,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -62,6 +64,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy", "load_format": "dummy",
"max-model-len": 2048, "max-model-len": 2048,
@@ -75,83 +78,5 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200 "num_prompts": 200
} }
},
{
"test_name": "serving_deepseek_r1",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 200,
"async-scheduling": "",
"dtype": "bfloat16"
},
"client_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"disable_log_stats": "",
"max-model-len": 2048,
"max-num-seqs": 128,
"async-scheduling": "",
"enable_expert_parallel": "",
"max-num-batched-tokens": 4096
},
"client_parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_qwen3_8b",
"qps_list": [1, 4, 10, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"disable_log_stats": "",
"async-scheduling": ""
},
"client_parameters": {
"model": "Qwen/Qwen-3-8B",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
} }
] ]

View File

@@ -5,6 +5,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -22,6 +23,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -39,6 +41,7 @@
"server_parameters": { "server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2, "tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
"load_format": "dummy" "load_format": "dummy"
}, },
@@ -56,6 +59,7 @@
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": { "speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct", "model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4, "num_speculative_tokens": 4,

View File

@@ -57,67 +57,5 @@
"max-num-seqs": 512, "max-num-seqs": 512,
"async-scheduling": "" "async-scheduling": ""
} }
},
{
"test_name": "throughput_deepseek_r1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 384,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"tensor_parallel_size": 8,
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": "",
"enable_expert_parallel": ""
}
},
{
"test_name": "throughput_qwen3_8b",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "Qwen/Qwen-3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"dataset_name": "sharegpt",
"num_prompts": 1000,
"max-num-seqs": 512,
"backend": "vllm",
"async-scheduling": ""
}
} }
] ]

View File

@@ -83,7 +83,7 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:

View File

@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
# Format ROCm version for path (e.g., "7.1" -> "rocm710") # Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')" ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}" ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases ## ROCm Wheel and Docker Image Releases
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\` \`\`\`
@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
- **torchvision**: TorchVision for ROCm PyTorch - **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch - **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings - **amdsmi**: AMD SMI Python bindings
- **amd_aiter**: Aiter for ROCm - **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm - **flash-attn**: Flash Attention for ROCm
### :warning: Notes ### :warning: Notes

View File

@@ -83,7 +83,7 @@ case "${1:-}" in
exit 1 exit 1
fi fi
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2 echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1 exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in
echo "" echo ""
echo "Downloaded wheels:" echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \; ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo "" echo ""
echo "Total: $WHEEL_COUNT wheels" echo "Total: $WHEEL_COUNT wheels"
echo "========================================" echo "========================================"

View File

@@ -1,213 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Check if Ray LLM can generate lock files that are compatible with this
# version of vllm. Downloads Ray's requirement files and runs a full
# dependency resolution with the installed vllm's constraints to see if
# a valid lock file can be produced.
#
# See: https://github.com/vllm-project/vllm/issues/33599
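#
# Assumed usage (illustrative; the exact CI entry point is not shown here): run
# with the vllm wheel under test already installed in the current environment,
#   pip install dist/vllm-*.whl && bash <this script>
# Exit status is 0 when uv can resolve Ray's requirements under vllm's
# constraints, non-zero otherwise.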
set -eo pipefail
RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
WORK_DIR=$(mktemp -d)
trap 'rm -rf "$WORK_DIR"' EXIT
# Fetch all Ray requirement files used in the LLM depset pipeline
echo ">>> Fetching Ray requirement files"
RAY_FILES=(
"requirements.txt"
"requirements/cloud-requirements.txt"
"requirements/base-test-requirements.txt"
"requirements/llm/llm-requirements.txt"
"requirements/llm/llm-test-requirements.txt"
)
for FILE in "${RAY_FILES[@]}"; do
LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
echo " ${FILE}"
curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
done
# Extract installed vllm deps
echo ">>> Extracting installed vllm dependency constraints"
python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
"""Write out the installed vllm's dependencies as pip constraint lines.
Ray uses vllm[audio], so audio-extra deps are included with their extra
markers stripped. The resolver cannot evaluate extra markers for a
package that is not itself being resolved from an index, so we activate
them manually here.
"""
import importlib.metadata
import re
import sys
out_path = sys.argv[1]
raw_reqs = importlib.metadata.requires("vllm") or []
# Ray uses vllm[audio]; activate that extra.
ACTIVE_EXTRAS = {"audio"}
EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
lines = []
for r in raw_reqs:
if ";" not in r:
# Unconditional dep — always include.
lines.append(r.strip())
continue
req_part, _, marker_part = r.partition(";")
marker_part = marker_part.strip()
extra_matches = EXTRA_RE.findall(marker_part)
if not extra_matches:
# Non-extra marker (python_version, etc.) — keep as-is.
lines.append(r.strip())
continue
if not ACTIVE_EXTRAS.intersection(extra_matches):
continue # Skip inactive extras (tensorizer, bench, …).
# Strip the extra== conditions but keep any remaining markers
# (e.g. python_version).
cleaned = EXTRA_RE.sub("", marker_part)
cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
if cleaned:
lines.append(f"{req_part.strip()} ; {cleaned}")
else:
lines.append(req_part.strip())
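# Worked examples (illustrative requirement strings, not the real vllm deps):
#   "librosa ; extra == 'audio'"                       -> "librosa"
#   "soundfile ; python_version >= '3.10' and extra == 'audio'"
#                                    -> "soundfile ; python_version >= '3.10'"
#   "tensorizer ; extra == 'tensorizer'"               -> skipped (inactive extra)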
with open(out_path, "w") as f:
for line in lines:
f.write(line + "\n")
print(f"Wrote {len(lines)} constraints to {out_path}")
PYEOF
echo ">>> Installed vllm deps (first 20 lines):"
head -20 "${WORK_DIR}/vllm-constraints.txt"
# Remove Ray's vllm pin — the installed vllm's transitive deps
# (written above) replace it in the resolution. vllm itself cannot
# be resolved from PyPI for in-development versions, so we test
# whether Ray's requirements can coexist with vllm's dependency
# constraints instead.
sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
# Install uv if needed
if ! command -v uv &>/dev/null; then
echo ">>> Installing uv"
pip install uv -q
fi
# Resolve: given vllm's constraints, can Ray compile a lock file?
#
# vllm's dependency constraints are the fixed side — Ray is flexible and
# can regenerate its lock files. We pass vllm's constraints via -c so
# the resolver treats them as non-negotiable bounds, then check whether
# Ray's own requirements can still be satisfied within those bounds.
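# Illustrative conflict (version bounds made up for the example): if vllm's
# constraints pin protobuf>=6 while one of Ray's requirement files caps
# protobuf<5, no single version satisfies both and the compile below fails.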
echo ""
echo "============================================================"
echo ">>> Resolving: Can Ray generate compatible lock files?"
echo "============================================================"
set +e
uv pip compile \
"${WORK_DIR}/requirements.txt" \
"${WORK_DIR}/cloud-requirements.txt" \
"${WORK_DIR}/base-test-requirements.txt" \
"${WORK_DIR}/llm-requirements.txt" \
"${WORK_DIR}/llm-test-requirements.txt" \
-c "${WORK_DIR}/vllm-constraints.txt" \
--python-version 3.12 \
--python-platform x86_64-manylinux_2_31 \
--extra-index-url https://download.pytorch.org/whl/cu129 \
--index-strategy unsafe-best-match \
--unsafe-package setuptools \
--unsafe-package ray \
--no-header \
-o "${WORK_DIR}/resolved.txt" \
2>&1
EXIT_CODE=$?
set -e
echo ""
echo "=========================================="
if [ $EXIT_CODE -eq 0 ]; then
echo "SUCCESS: Ray can generate lock files compatible with this vllm."
echo ""
echo "Key resolved versions:"
grep -E '^(protobuf|torch|numpy|transformers)==' \
"${WORK_DIR}/resolved.txt" | sort || true
echo "=========================================="
exit 0
fi
echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
echo "This means a fundamental dependency conflict exists that Ray"
echo "cannot resolve by regenerating its lock files."
echo "See: https://github.com/vllm-project/vllm/issues/33599"
echo "=========================================="
# Buildkite annotation
if [ -f /usr/bin/buildkite-agent ]; then
buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
### :warning: Ray Dependency Compatibility Warning
This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
Ray would not be able to regenerate its lock files to accommodate this vllm version.
Please check the **Ray Dependency Compatibility Check** step logs for details.
See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
EOF
fi
# Notify Slack if webhook is configured and PR/branch are valid.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
url = os.getenv("BUILDKITE_BUILD_URL", "#")
data = {
"text": ":warning: Ray Dependency Compatibility Check Failed",
"blocks": [{
"type": "section",
"text": {
"type": "mrkdwn",
"text": (
"*:warning: Ray Dependency Compatibility Check Failed*\n"
f"PR #{pr} on branch `{branch}` introduces dependencies "
f"that cannot be resolved with Ray'\''s requirements.\n"
f"<{url}|View Build>"
),
},
}],
}
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi
exit 1

View File

@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file # Store PR data in a temp file
PR_DATA=$(mktemp) PR_DATA=$(mktemp)
trap 'rm -f "$PR_DATA"' EXIT trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \ if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \ --limit 1000 \

View File

@@ -1,57 +1,25 @@
#!/bin/bash #!/bin/bash
# This script runs tests inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
# bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
# export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
# bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
set -o pipefail set -o pipefail
# Export Python path # Export Python path
export PYTHONPATH=".." export PYTHONPATH=".."
############################################################################### # Print ROCm version
# Helper Functions echo "--- Confirming Clean Initial State"
############################################################################### while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
wait_for_clean_gpus() { echo "--- ROCm info"
local timeout=${1:-300} rocminfo
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
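# e.g. wait_for_clean_gpus        # default 300 s timeout
#      wait_for_clean_gpus 600    # allow extra time right after a GPU reset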
# cleanup older docker images
cleanup_docker() { cleanup_docker() {
# Get Docker's root directory # Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}') docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -60,12 +28,15 @@ cleanup_docker() {
exit 1 exit 1
fi fi
echo "Docker root directory: $docker_root" echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70 threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
@@ -74,445 +45,193 @@ cleanup_docker() {
} }
cleanup_network() { cleanup_network() {
local max_nodes=${NUM_NODES:-2} for node in $(seq 0 $((NUM_NODES-1))); do
for node in $(seq 0 $((max_nodes - 1))); do if docker ps -a -q -f name="node${node}" | grep -q .; then
if docker ps -a -q -f name="node${node}" | grep -q .; then docker stop "node${node}"
docker stop "node${node}" || true
fi fi
done done
if docker network ls | grep -q docker-net; then if docker network ls | grep docker-net; then
docker network rm docker-net || true docker network rm docker-net
fi fi
} }
is_multi_node() { # Call the cleanup docker function
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
return 0
fi
# Fallback: detect the bracket syntax structurally
# Pattern: [...] && [...] (per-node command arrays)
if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
return 0
fi
return 1
}
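# Examples: NUM_NODES=2 in the environment, or a command string shaped like
#   "[pytest -v -s distributed/test_a.py] && [pytest -v -s distributed/test_b.py]",
# is detected as multi-node; a plain "pytest -v -s v1/core" is single-node.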
handle_pytest_exit() {
local exit_code=$1
if [ "$exit_code" -eq 5 ]; then
echo "Pytest exit code 5 (no tests collected) - treating as success."
exit 0
fi
exit "$exit_code"
}
###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
# pytest -v -s -m 'not cpu_test' v1/core
# becomes:
# pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
# - test path (contains '/')
# - test file (ends with '.py')
# - another pytest flag (--xxx or -x single-char flags)
# - command separator (&& || ; |)
# - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
re_quote_pytest_markers() {
local input="$1"
local output=""
local collecting=false
local marker_buf=""
# Strip backslash-newline continuations, then flatten remaining newlines
local flat="${input//$'\\\n'/ }"
flat="${flat//$'\n'/ }"
# Disable globbing to prevent *.py etc. from expanding during read -ra
local restore_glob
restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
set -o noglob
local -a words
read -ra words <<< "$flat"
eval "$restore_glob"
for word in "${words[@]}"; do
if $collecting; then
# If the token we're about to collect already contains a literal
# single quote, the expression was already quoted upstream.
# Flush and stop collecting.
if [[ "$word" == *"'"* ]]; then
if [[ -n "$marker_buf" ]]; then
# Should not normally happen (partial buf + quote), flush raw
output+="${marker_buf} "
marker_buf=""
fi
output+="${word} "
collecting=false
continue
fi
local is_boundary=false
case "$word" in
# Line-continuation artifact
"\\")
is_boundary=true ;;
# Command separators
"&&"|"||"|";"|"|")
is_boundary=true ;;
# Long flags (--ignore, --shard-id, etc.)
--*)
is_boundary=true ;;
# Short flags (-v, -s, -x, etc.) but NOT negative marker tokens
# like "not" which don't start with "-". Also skip -k/-m which
# would start a new marker (handled below).
-[a-zA-Z])
is_boundary=true ;;
# Test path (contains /)
*/*)
is_boundary=true ;;
# Test file (ends with .py, possibly with ::method)
*.py|*.py::*)
is_boundary=true ;;
# Environment variable assignment preceding a command (FOO=bar)
*=*)
# Only treat as boundary if it looks like VAR=value, not
# pytest filter expressions like num_gpus=2 inside markers
if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
is_boundary=true
fi
;;
esac
if $is_boundary; then
# Strip surrounding double quotes if present (from upstream
# single-to-double conversion); without this, wrapping below
# would produce '"expr"' with literal double-quote characters.
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
# Flush the collected marker expression
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}' "
else
output+="${marker_buf} "
fi
collecting=false
marker_buf=""
# Check if this boundary word itself starts a new -m/-k
if [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
# Drop stray backslash tokens silently
elif [[ "$word" == "\\" ]]; then
:
else
output+="${word} "
fi
else
# Accumulate into marker buffer
if [[ -n "$marker_buf" ]]; then
marker_buf+=" ${word}"
else
marker_buf="${word}"
fi
fi
elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
output+="${word} "
collecting=true
marker_buf=""
else
output+="${word} "
fi
done
# Flush any trailing marker expression (marker at end of command)
if $collecting && [[ -n "$marker_buf" ]]; then
# Strip surrounding double quotes (see mid-stream flush comment)
if [[ "$marker_buf" == '"'*'"' ]]; then
marker_buf="${marker_buf#\"}"
marker_buf="${marker_buf%\"}"
fi
if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
output+="'${marker_buf}'"
else
output+="${marker_buf}"
fi
fi
echo "${output% }"
}
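# Illustrative transformation (marker names here are only examples):
#   in : pytest -v -s -m not cpu_test and not slow_test v1/core
#   out: pytest -v -s -m 'not cpu_test and not slow_test' v1/core
# Single-word markers (-m cpu_test) and already-quoted expressions pass through
# unchanged.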
###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################
apply_rocm_test_overrides() {
local cmds="$1"
# --- Model registry filter ---
if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
# --- LoRA: disable custom paged attention ---
if [[ $cmds == *"pytest -v -s lora"* ]]; then
cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
# --- Kernel ignores ---
if [[ $cmds == *" kernels/core"* ]]; then
cmds="${cmds} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $cmds == *" kernels/attention"* ]]; then
cmds="${cmds} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $cmds == *" kernels/quantization"* ]]; then
cmds="${cmds} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $cmds == *" kernels/mamba"* ]]; then
cmds="${cmds} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $cmds == *" kernels/moe"* ]]; then
cmds="${cmds} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
# --- Entrypoint ignores ---
if [[ $cmds == *" entrypoints/openai "* ]]; then
cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/chat_completion/test_audio.py \
--ignore=entrypoints/openai/completion/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/chat_completion/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/completion/test_prompt_validation.py "}
fi
if [[ $cmds == *" entrypoints/llm "* ]]; then
cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
# Clean up escaped newlines from --ignore appends
cmds=$(echo "$cmds" | sed 's/ \\ / /g')
echo "$cmds"
}
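# Illustrative rewrites (taken from the cases above):
#   "pytest -v -s lora"  -> "VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"
#   "pytest -v -s models/test_registry.py"
#        -> same command plus "-k 'not BambaForCausalLM and not GritLM ...'"
# Commands that target kernels/core additionally gain the two --ignore flags
# listed above.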
###############################################################################
# Main
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker cleanup_docker
echo "--- Resetting GPUs" echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image --- echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container" echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}" docker pull "${image_name}"
remove_docker_container() { remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
# --- Prepare commands ---
echo "--- Running container" echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
# ---- Command source selection ---- commands=$@
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell.
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
commands="${VLLM_TEST_COMMANDS}"
echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
else
commands="$*"
if [[ -z "$commands" ]]; then
echo "Error: No test commands provided." >&2
echo "Usage:" >&2
echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
echo " Legacy: bash $0 \"commands here\"" >&2
exit 1
fi
echo "Commands sourced from positional args (legacy mode)"
echo "WARNING: Inner double-quotes in the command string may have been"
echo " stripped by the calling shell. If you see syntax errors, switch to:"
echo " export VLLM_TEST_COMMANDS='your commands here'"
echo " bash $0"
fi
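# Illustrative invocations (the script path and the pytest target are placeholders only):
#   Preferred: VLLM_TEST_COMMANDS='cd tests; pytest -v -s lora' bash run-amd-test.sh
#   Legacy:    bash run-amd-test.sh "cd tests; pytest -v -s lora"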
echo "Raw commands: $commands" echo "Raw commands: $commands"
# Fix quoting before ROCm overrides (so overrides see correct structure) commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"
commands=$(apply_rocm_test_overrides "$commands") if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \
--ignore=kernels/core/test_permute_cols.py"
fi
if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_mha_attn.py \
--ignore=kernels/attention/test_lightning_attn.py \
--ignore=kernels/attention/test_attention.py"
fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands" echo "Final commands: $commands"
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
MYPYTHONPATH=".." MYPYTHONPATH=".."
# Verify GPU access # Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3) render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2 echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1 exit 1
fi fi
# --- RDMA device passthrough (conditional) --- if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
RDMA_FLAGS=""
if [ -d /dev/infiniband ]; then
echo "RDMA devices detected on host, enabling passthrough"
RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
else
echo "No RDMA devices found on host, RDMA tests will be skipped"
fi
# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
echo "--- Multi-node job detected"
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/') export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
# Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds] if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
# BASH_REMATCH[1] = prefix (everything before first bracket) prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
# BASH_REMATCH[2] = comma-separated node0 commands echo "PREFIX: ${prefix}"
# BASH_REMATCH[3] = comma-separated node1 commands export composite_command="(command rocm-smi || true)"
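# Illustrative commands string in this format (contents are made up):
#   export FOO=1 ; ["pytest -v -s dist/test_a.py", "pytest -v -s dist/test_b.py"] && ["pytest -v -s dist/test_a.py", "pytest -v -s dist/test_b.py"]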
if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then myIFS=$IFS
prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g') IFS=','
echo "PREFIX: ${prefix}" read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export composite_command="(command rocm-smi || true)" export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
saved_IFS=$IFS echo "COMMANDS: ${commands}"
IFS=',' composite_command=$(echo "${composite_command} && ${commands}")
read -ra node0 <<< "${BASH_REMATCH[2]}" done
read -ra node1 <<< "${BASH_REMATCH[3]}" /bin/bash -c "${composite_command}"
IFS=$saved_IFS cleanup_network
if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
fi
for i in "${!node0[@]}"; do
command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${step_cmd}"
composite_command="${composite_command} && ${step_cmd}"
done
/bin/bash -c "${composite_command}"
exit_code=$?
cleanup_network
handle_pytest_exit "$exit_code"
else else
echo "Multi-node job detected but failed to parse bracket command syntax." echo "Failed to parse node commands! Exiting."
echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]" cleanup_network
echo "Got: $commands" exit 111
cleanup_network
exit 111
fi fi
else else
echo "--- Single-node job"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
$RDMA_FLAGS \ --network=host \
--network=host \ --shm-size=16gb \
--shm-size=16gb \ --group-add "$render_gid" \
--group-add "$render_gid" \ --rm \
--rm \ -e HF_TOKEN \
-e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \
-e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \
-e AWS_SECRET_ACCESS_KEY \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB \ -e "HF_HOME=${HF_MOUNT}" \
-e BUILDKITE_PARALLEL_JOB_COUNT \ -e "PYTHONPATH=${MYPYTHONPATH}" \
-v "${HF_CACHE}:${HF_MOUNT}" \ --name "${container_name}" \
-e "HF_HOME=${HF_MOUNT}" \ "${image_name}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \ /bin/bash -c "${commands}"
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
exit_code=$?
handle_pytest_exit "$exit_code"
fi fi

View File

@@ -1,65 +0,0 @@
#!/bin/bash
set -euox pipefail
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-process spawning to speed up the run
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
wait_for_pid_and_check_log() {
local pid="$1"
local log_file="$2"
local exit_status
if [ -z "$pid" ] || [ -z "$log_file" ]; then
echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
return 1
fi
echo "Waiting for process $pid to finish..."
# Use the 'wait' command to pause the script until the specific PID exits.
# The 'wait' command's own exit status will be that of the waited-for process.
if wait "$pid"; then
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Success)."
else
exit_status=$?
echo "Process $pid finished with exit status $exit_status (Failure)."
fi
if [ "$exit_status" -ne 0 ]; then
echo "Process exited with a non-zero status."
echo "--- Last few lines of log file: $log_file ---"
tail -n 50 "$log_file"
echo "---------------------------------------------"
return 1 # Indicate failure based on exit status
fi
echo "No errors detected in log file and process exited successfully."
return 0
}
# Test Sky Lake (AVX512F)
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!
# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!
# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!
wait_for_pid_and_check_log $PID_TEST_0 test_0.log
wait_for_pid_and_check_log $PID_TEST_1 test_1.log
wait_for_pid_and_check_log $PID_TEST_2 test_2.log

View File

@@ -1,43 +1,26 @@
#!/bin/bash #!/bin/bash
set -euox pipefail set -euox pipefail
export VLLM_CPU_CI_ENV=0
echo "--- PP+TP" echo "--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \ --num-prompts 20 \
--result-dir ./test_results \
--result-filename tp_pp.json \
--save-result \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true kill -s SIGTERM $server_pid &
failed_req=$(jq '.failed' ./test_results/tp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi
echo "--- DP+TP" echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$! server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \ --num-prompts 20 \
--result-dir ./test_results \
--result-filename dp_pp.json \
--save-result \
--endpoint /v1/completions --endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true kill -s SIGTERM $server_pid &
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference # offline inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
set -e set -e
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests # Run model tests
docker exec cpu-test bash -c " docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1 export TORCH_COMPILE_DISABLE=1
set -xve set -xve
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test # Run basic model test
podman exec -it "$container_id" bash -c " podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.

View File

@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
' '

View File

@@ -1,42 +1,17 @@
#!/bin/bash #!/bin/bash
# This script builds the HPU docker image and runs the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
#
# vllm-gaudi compatibility pinning:
# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
# When upstream vllm changes its API, the plugin may break before it has been updated.
# To handle this, the vllm-gaudi repository maintains a file:
# vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
# The first line of that file controls what version of vllm is used inside the Docker image:
# - "latest" : no checkout override; the current Buildkite CI commit is used as-is.
# - "<commit SHA>" : vllm is checked out to that specific commit before building, pinning
# the test to a known-compatible baseline.
# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
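# Illustrative pin file contents (only the first line is read), either:
#   latest
# or a single known-good vllm commit SHA (made-up value):
#   0123456789abcdef0123456789abcdef01234567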
set -exuo pipefail set -exuo pipefail
# Fetch the vllm community commit reference from vllm-gaudi (first line only).
VLLM_COMMUNITY_COMMIT=$(curl -s \
https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
| head -1 | tr -d '\n')
echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
# Try building the docker image # Try building the docker image
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}" image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container" container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t "${image_name}" -f - . cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest FROM gaudi-base-image:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
# to the version known to be compatible with vllm-gaudi. When the value is "latest",
# the current checkout (the Buildkite CI commit) is used unchanged.
RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
fi
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1 ENV no_proxy=localhost,127.0.0.1
@@ -64,19 +39,19 @@ EOF
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f "${container_name}" || true; } remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers remove_docker_containers
echo "Running HPU plugin v1 test" echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name="${container_name}" --network=host \ docker run --rm --runtime=habana --name=${container_name} --network=host \
-e HABANA_VISIBLE_DEVICES=all \ -e HABANA_VISIBLE_DEVICES=all \
-e VLLM_SKIP_WARMUP=true \ -e VLLM_SKIP_WARMUP=true \
-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-e PT_HPU_LAZY_MODE=1 \ -e PT_HPU_LAZY_MODE=1 \
"${image_name}" \ "${image_name}" \
/bin/bash -c ' /bin/bash -c '
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
' '
EXITCODE=$? EXITCODE=$?

View File

@@ -41,7 +41,6 @@ get_config() {
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1 exit 1
fi fi
# shellcheck source=/dev/null
source "${TEST_RUN_CONFIG_FILE}" source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0 return 0
@@ -49,8 +48,9 @@ get_config() {
# get test running configuration. # get test running configuration.
fetch_vllm_test_cfg fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script. # Check if the function call was successful. If not, exit the script.
if ! get_config; then if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}" echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}" builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}" builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p "${builder_cache_dir}" mkdir -p ${builder_cache_dir}
# Try building the docker image # Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \ cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \ --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \ --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest="${builder_cache_dir}",mode=max \ --cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t "${image_name}" -f - . --progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME} FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. # e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns one argument per line: --device, /dev/davinciX, ... # returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() { parse_and_gen_devices() {
local input="$1" local input="$1"
local index cards_num local index cards_num
@@ -151,24 +151,29 @@ parse_and_gen_devices() {
return 1 return 1
fi fi
local devices=""
local i=0 local i=0
while (( i < cards_num )); do while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i )) local dev_idx=$(((index - 1)*cards_num + i ))
printf '%s\n' "--device" devices="$devices --device /dev/davinci${dev_idx}"
printf '%s\n' "/dev/davinci${dev_idx}"
((i++)) ((i++))
done done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
} }
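# Worked example for the agent-name format described above (values taken from the comment's example):
#   BUILDKITE_AGENT_NAME=atlas-a2-001-1-2cards -> index=1, cards_num=2
#   resulting docker args: --device /dev/davinci0 --device /dev/davinci1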
mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with # This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend. # the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx} model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p "${model_cache_dir}" mkdir -p ${model_cache_dir}
docker run \ docker run \
"${device_args[@]}" \ ${devices} \
--device /dev/davinci_manager \ --device /dev/davinci_manager \
--device /dev/devmm_svm \ --device /dev/devmm_svm \
--device /dev/hisi_hdc \ --device /dev/hisi_hdc \
@@ -177,7 +182,7 @@ docker run \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \ -v /etc/ascend_install.info:/etc/ascend_install.info \
-v "${model_cache_dir}":/root/.cache/modelscope \ -v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \ --entrypoint="" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t "${image_name}" -f docker/Dockerfile.xpu . docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
@@ -34,17 +34,17 @@ docker run \
set -e set -e
echo $ZE_AFFINITY_MASK echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0 pip install tblib==3.1.0
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/engine pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam
# pull original arch-dependent images from AWS ECR Public # pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images # tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub # push arch-dependent images to DockerHub
docker push vllm/vllm-openai:"$TAG_NAME"-x86_64 docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:"$TAG_NAME"-aarch64 docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub # push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:"$TAG_NAME" docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \ --enable-eplb \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# Nightly e2e test for prefetch offloading with a MoE model.
# Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
# and validates GSM8K accuracy matches baseline (no offloading).
#
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-Lite"
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
vllm serve "$MODEL" \
--max-model-len 2048 \
--offload-group-size 8 \
--offload-num-in-group 2 \
--offload-prefetch-step 1 \
--offload-params w13_weight w2_weight \
--port "$PORT" &
SERVER_PID=$!
wait_for_server "$PORT"
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_prefetch_offload.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} prefetch_offload: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}"
PY
cleanup
SERVER_PID=

View File

@@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-eplb \ --enable-eplb \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size "${DATA_PARALLEL_SIZE}" \ --data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \ --enable-expert-parallel \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter") BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail # Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN") PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode." echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else else
# Non-ROCm platform (CUDA/other) # Non-ROCm platform (CUDA/other)
@@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend "$BACK" \ --all2all-backend $BACK \
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \ --gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \ "${PLATFORM_ARGS[@]}" \
--port "$PORT" & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server "$PORT" wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____') TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json" OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy'] import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")

View File

@@ -1,248 +0,0 @@
#!/bin/bash
# Run BFCL (Berkeley Function Call Leaderboard) tool-calling correctness
# evaluation against a local vLLM server.
#
# Usage:
# # Run with defaults (gpt-oss-20b, multi_turn)
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Run with gpt-oss-120b and multiple test categories
# BFCL_MODEL="openai/gpt-oss-120b" BFCL_TP_SIZE=4 \
# BFCL_TEST_CATEGORY="live_simple, multiple, parallel_multiple" \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# # Chain both API types (use BFCL_OUTPUT_DIR to avoid overwriting results)
# BFCL_OUTPUT_DIR=./bfcl-chat-completions BFCL_API_TYPE=chat_completions \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh && \
# BFCL_OUTPUT_DIR=./bfcl-responses BFCL_API_TYPE=responses \
# bash .buildkite/scripts/tool_call/run-bfcl-eval.sh
#
# Environment variables (all optional, with defaults):
# BFCL_MODEL - HF model name (default: openai/gpt-oss-20b)
# BFCL_API_TYPE - API type: "chat_completions" or "responses" (default: chat_completions)
# BFCL_OUTPUT_DIR - Directory for BFCL results (default: current working directory)
# BFCL_TEST_CATEGORY - BFCL test categories (default: multi_turn)
# BFCL_TOOL_CALL_PARSER - Tool call parser name (default: openai)
# BFCL_NUM_THREADS - Threads for BFCL generate (default: 8)
# BFCL_TP_SIZE - Tensor parallel size (default: 1)
# BFCL_MAX_MODEL_LEN - Max model length (default: 4096)
# BFCL_PORT - Server port (default: 8000)
# BFCL_REASONING_PARSER - Reasoning parser name (default: disabled)
# BFCL_EXTRA_ARGS - Additional vLLM server args
set -euo pipefail
# ---- Configuration ----
MODEL="${BFCL_MODEL:-openai/gpt-oss-20b}"
API_TYPE="${BFCL_API_TYPE:-chat_completions}"
OUTPUT_DIR="${BFCL_OUTPUT_DIR:-}"
TEST_CATEGORY="${BFCL_TEST_CATEGORY:-multi_turn}"
TOOL_CALL_PARSER="${BFCL_TOOL_CALL_PARSER:-openai}"
NUM_THREADS="${BFCL_NUM_THREADS:-8}"
TP_SIZE="${BFCL_TP_SIZE:-1}"
MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}"
PORT="${BFCL_PORT:-8000}"
REASONING_PARSER="${BFCL_REASONING_PARSER:-}"
EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}"
# Set up output directory
if [ -n "$OUTPUT_DIR" ]; then
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR="$(cd "$OUTPUT_DIR" && pwd)"
fi
echo "============================================"
echo "BFCL Tool Call Correctness Evaluation"
echo "============================================"
echo "Model: $MODEL"
echo "Tool parser: $TOOL_CALL_PARSER"
echo "API type: $API_TYPE"
echo "Output dir: ${OUTPUT_DIR:-<cwd>}"
echo "Test category: $TEST_CATEGORY"
echo "TP size: $TP_SIZE"
echo "Max model len: $MAX_MODEL_LEN"
echo "Port: $PORT"
echo "Num threads: $NUM_THREADS"
echo "============================================"
# ---- Install bfcl-eval if missing ----
if ! python3 -c "import bfcl_eval" 2>/dev/null; then
echo "Installing bfcl-eval..."
pip install "bfcl-eval>=2025.10.20.1,<2026"
fi
# ---- Cleanup handler ----
SERVER_PID=""
cleanup() {
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (pid=$SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
# Remove BFCL lock files (created by filelock for thread-safe writes)
rm -rf .file_locks/
if [ -n "${OUTPUT_DIR:-}" ]; then
rm -rf "$OUTPUT_DIR/.file_locks/"
fi
}
trap cleanup EXIT
# ---- Start vLLM server ----
echo "Starting vLLM server..."
SERVE_ARGS=(
"$MODEL"
--port "$PORT"
--enable-auto-tool-choice
--tool-call-parser "$TOOL_CALL_PARSER"
--tensor-parallel-size "$TP_SIZE"
--max-model-len "$MAX_MODEL_LEN"
--enforce-eager
--no-enable-prefix-caching
)
# Append reasoning parser if specified
if [ -n "$REASONING_PARSER" ]; then
SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi
# Append any extra args
if [ -n "$EXTRA_ARGS" ]; then
read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS"
SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}")
fi
echo "Command: vllm serve ${SERVE_ARGS[*]}"
vllm serve "${SERVE_ARGS[@]}" &
SERVER_PID=$!
# ---- Wait for server to be ready ----
echo "Waiting for vLLM server to start (timeout: 600s)..."
SECONDS_WAITED=0
until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do
if [ $SECONDS_WAITED -ge 600 ]; then
echo ""
echo "ERROR: vLLM server failed to start within 600s"
exit 1
fi
if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then
echo " Still waiting... (${SECONDS_WAITED}s elapsed)"
fi
sleep 2
SECONDS_WAITED=$((SECONDS_WAITED + 2))
done
echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)"
# ---- Run BFCL evaluation ----
# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer
# functions that must be called from Python. The MODEL_CONFIG_MAPPING must
# be patched in-process so BFCL knows to use the OpenAI-compatible handler
# against our local vLLM server.
bfcl_exit_code=0
python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$?
import os
import sys
model = sys.argv[1]
test_category = sys.argv[2]
num_threads = int(sys.argv[3])
port = sys.argv[4]
api_type = sys.argv[5]
output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd()
os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1"
os.environ["OPENAI_API_KEY"] = "dummy"
os.environ["BFCL_PROJECT_ROOT"] = output_dir
import bfcl_eval.constants.model_config as bfcl_model_config
from bfcl_eval.constants.model_config import ModelConfig
from bfcl_eval.model_handler.api_inference.openai_completion import (
OpenAICompletionsHandler,
)
from bfcl_eval.model_handler.api_inference.openai_response import (
OpenAIResponsesHandler,
)
if api_type == "responses":
handler = OpenAIResponsesHandler
else:
handler = OpenAICompletionsHandler
bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig(
model_name=model,
display_name=f"{model} (FC) (vLLM)",
url=f"https://huggingface.co/{model}",
org="",
license="apache-2.0",
model_handler=handler,
input_price=None,
output_price=None,
is_fc_model=True,
underscore_to_dot=True,
)
from bfcl_eval.__main__ import evaluate, generate
import inspect
import typer
def _get_default_kwargs(function):
kwargs = {}
for k, v in inspect.signature(function).parameters.items():
if v.default is not inspect.Parameter.empty:
default = v.default
if isinstance(default, typer.models.OptionInfo):
default = default.default
kwargs[k] = default
return kwargs
# ---- generate ----
print(f"=== BFCL generate: model={model} test_category={test_category} ===")
gen_kwargs = _get_default_kwargs(generate)
gen_kwargs["model"] = [model]
gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
gen_kwargs["skip_server_setup"] = True
gen_kwargs["num_threads"] = num_threads
generate(**gen_kwargs)
# ---- evaluate ----
print(f"=== BFCL evaluate: model={model} test_category={test_category} ===")
eval_kwargs = _get_default_kwargs(evaluate)
eval_kwargs["model"] = [model]
eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")]
evaluate(**eval_kwargs)
print("=== BFCL evaluation completed successfully ===")
PYEOF
# ---- Upload results to buildkite ----
if command -v buildkite-agent &>/dev/null; then
if [ $bfcl_exit_code -eq 0 ]; then
STYLE="success"
STATUS="PASSED"
else
STYLE="error"
STATUS="FAILED"
fi
buildkite-agent annotate --style "$STYLE" --context "bfcl-results" <<EOF
### BFCL Tool Call Correctness - ${STATUS}
- **Model:** \`${MODEL}\`
- **Parser:** \`${TOOL_CALL_PARSER}\`
- **API type:** \`${API_TYPE}\`
- **Test category:** \`${TEST_CATEGORY}\`
EOF
# BFCL writes results to $BFCL_PROJECT_ROOT/result/ and scores to
# $BFCL_PROJECT_ROOT/score/
RESULTS_ROOT="${OUTPUT_DIR:-.}"
if [ -d "$RESULTS_ROOT/result" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/result/**/*"
fi
if [ -d "$RESULTS_ROOT/score" ]; then
buildkite-agent artifact upload "$RESULTS_ROOT/score/**/*"
fi
fi
exit $bfcl_exit_code

View File

@@ -9,11 +9,10 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables # For testing on local vm, use `set -a` to export all variables
source /etc/environment source /etc/environment
# shellcheck source=/dev/null source $ENV_FILE
source "$ENV_FILE"
remove_docker_container() { remove_docker_container() {
docker rm -f "$CONTAINER_NAME" || true; docker rm -f $CONTAINER_NAME || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
@@ -42,13 +41,13 @@ echo
echo "starting docker...$CONTAINER_NAME" echo "starting docker...$CONTAINER_NAME"
echo echo
docker run \ docker run \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \ -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file "$ENV_FILE" \ --env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \ -e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \ -e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL="$MODEL" \ -e MODEL=$MODEL \
-e WORKSPACE=/workspace \ -e WORKSPACE=/workspace \
--name "$CONTAINER_NAME" \ --name $CONTAINER_NAME \
-d \ -d \
--privileged \ --privileged \
--network host \ --network host \

View File

@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve "$MODEL" \ vllm serve $MODEL \
--seed 42 \ --seed 42 \
--max-num-seqs "$MAX_NUM_SEQS" \ --max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--download_dir "$DOWNLOAD_DIR" \ --download_dir $DOWNLOAD_DIR \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 & --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.." echo "wait for 20 minutes.."
echo echo
# sleep 1200 # sleep 1200
# wait for 10 minutes... # wait for 10 minutes...
for _ in {1..120}; do for i in {1..120}; do
# TODO: detect other type of errors. # TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting." echo "Detected RuntimeError, exiting."
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo echo
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model "$MODEL" \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \ --dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len "$INPUT_LEN" \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len "$OUTPUT_LEN" \ --sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG" --ignore-eos > "$BM_LOG"
echo "completed..." echo "completed..."

View File

@@ -72,19 +72,20 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indices for all existing wheels # call script to generate indicies for all existing wheels
# these indices have relative paths that work as long as they sit next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
alias_args=() if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS") else
alias_arg=""
fi fi
# HACK: we do not need regex module here, but it is required by pre-commit hook # HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module # To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}" $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally # copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX" echo "Uploading indices to $S3_COMMIT_PREFIX"
@@ -99,9 +100,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "${INDICES_OUTPUT_DIR:?}/*" rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}" $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi
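
The alias_args change above replaces an optional flag held in a string with a bash array. A minimal, self-contained sketch of why the empty array is the safer "no extra arguments" default:

    # Sketch only: an empty string still expands to one (empty) argument when quoted,
    # while an empty array expands to nothing at all.
    flag_str=""
    flag_arr=()
    set -- "$flag_str";      echo "string: $# arg(s)"   # string: 1 arg(s)
    set -- "${flag_arr[@]}"; echo "array:  $# arg(s)"   # array:  0 arg(s)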


@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version) RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null) GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION" echo "Release version from Buildkite: $RELEASE_VERSION"
@@ -54,13 +54,10 @@ mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64') # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory" echo "Wheels copied to local directory"
# generate source distribution using setup.py # generate source tarball
python setup.py sdist --dist-dir=$DIST_DIR git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR ls -la $DIST_DIR
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name) # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*") PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -68,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1 exit 1
fi fi
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE" python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE" python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels and source distribution uploaded to PyPI" echo "Wheels uploaded to PyPI"


@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT" echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then if [ "$WHEEL_COUNT" -eq 0 ]; then
@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi fi
# Extract version from vLLM wheel and update version-specific index # Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1) VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION" echo "Version in wheel: $VERSION"

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -17,15 +17,3 @@ steps:
- tests/benchmarks/ - tests/benchmarks/
commands: commands:
- pytest -v -s benchmarks/ - pytest -v -s benchmarks/
- label: Attention Benchmarks Smoke Test (B200)
device: b200
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1


@@ -36,16 +36,6 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100) - label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
@@ -101,8 +91,8 @@ steps:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100) - label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30 timeout_in_minutes: 30
@@ -131,10 +121,13 @@ steps:
optional: true optional: true
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
# Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition) # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)" # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100) - label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20 timeout_in_minutes: 20
@@ -150,8 +143,8 @@ steps:
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100) - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -169,7 +162,7 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100) - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
@@ -204,8 +197,7 @@ steps:
- tests/compile/fusions_e2e/ - tests/compile/fusions_e2e/
commands: commands:
- nvidia-smi - nvidia-smi
# Run all models but only FLASHINFER, Inductor partition and native custom ops # Run all models and attn backends but only Inductor partition and native custom ops
# include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# for ar-rms-quant-fp4, also sweep llama3 # for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4" - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

View File

@@ -50,18 +50,23 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py - pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Torchrun + Examples (4 GPUs) - label: Distributed Tests (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_torchrun_example.py - tests/distributed/test_utils
- tests/distributed/test_torchrun_example_moe.py - tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/ - examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838 # https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0 - export NCCL_CUMEM_HOST_ENABLE=0
@@ -79,27 +84,6 @@ steps:
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
- label: Distributed DP Tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_utils
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@@ -107,27 +91,20 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- label: Distributed Compile + Comm (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/compile/fullgraph/test_basic_correctness.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py - pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node) # TODO: create a dedicated test section for multi-GPU example tests
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node # when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
@@ -169,7 +146,6 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -189,7 +165,6 @@ steps:
num_devices: 2 num_devices: 2
num_nodes: 2 num_nodes: 2
no_plugin: true no_plugin: true
optional: true # TODO: revert once infra issue solved
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -222,31 +197,7 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - label: Pipeline + Context Parallelism (4 GPUs))
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_devices: 4


@@ -29,11 +29,15 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100) - label: Prime-RL Integration (2 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 30
device: h100
optional: true optional: true
num_devices: 1 soft_fail: true
num_devices: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 - nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh


@@ -14,59 +14,17 @@ steps:
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: Engine (1 GPU) - label: V1 e2e + engine
timeout_in_minutes: 30 timeout_in_minutes: 45
source_file_dependencies: source_file_dependencies:
- vllm/v1/engine/ - vllm/
- tests/v1/engine/ - tests/v1
commands: commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 2
source_file_dependencies:
- vllm/
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
depends_on:
- image-build-amd
- label: V1 e2e (4 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
optional: true
num_devices: 4
source_file_dependencies:
- vllm/
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4
depends_on:
- image-build-amd


@@ -34,26 +34,23 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2) - label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130 timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/rpc
- tests/entrypoints/instrumentator
- tests/tool_use - tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use - pytest -v -s tool_use
- label: Entrypoints Integration (Pooling) - label: Entrypoints Integration (Pooling)
@@ -82,11 +79,6 @@ steps:
- tests/v1 - tests/v1
commands: commands:
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness - label: OpenAI API Correctness
timeout_in_minutes: 30 timeout_in_minutes: 30


@@ -21,18 +21,3 @@ steps:
commands: commands:
- pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_execute.py
- pytest -v -s distributed/test_eplb_spec_decode.py - pytest -v -s distributed/test_eplb_spec_decode.py
- label: Elastic EP Scaling Test
timeout_in_minutes: 20
device: b200
optional: true
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/compilation/
- tests/distributed/
commands:
- pytest -v -s distributed/test_elastic_ep.py


@@ -8,9 +8,8 @@ steps:
- csrc/ - csrc/
- tests/kernels/core - tests/kernels/core
- tests/kernels/test_top_k_per_row.py - tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands: commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py - pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N - label: Kernels Attention Test %N
timeout_in_minutes: 35 timeout_in_minutes: 35
@@ -45,8 +44,7 @@ steps:
- vllm/envs.py - vllm/envs.py
- vllm/config - vllm/config
commands: commands:
- pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 2
- label: Kernels Mamba Test - label: Kernels Mamba Test
@@ -72,7 +70,7 @@ steps:
- tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/moe/test_batched_deepgemm.py
- tests/kernels/attention/test_deepgemm_attention.py - tests/kernels/attention/test_deepgemm_attention.py
commands: commands:
- pytest -v -s kernels/quantization/test_block_fp8.py - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
- pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s kernels/moe/test_deepgemm.py
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
@@ -97,7 +95,7 @@ steps:
- vllm/platforms/cuda.py - vllm/platforms/cuda.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/basic/chat.py
# Attention # Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_attention_selector.py
@@ -117,7 +115,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_flashinfer_moe.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e # e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py - pytest -v -s tests/models/quantization/test_nvfp4.py
@@ -157,6 +154,8 @@ steps:
commands: commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py - pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200) - label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60


@@ -11,17 +11,17 @@ steps:
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
# - label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
# device: a100 device: a100
# optional: true optional: true
# num_devices: 4 num_devices: 4
# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
# source_file_dependencies: source_file_dependencies:
# - csrc/ - csrc/
# - vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
# commands: commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100) - label: LM Eval Large Models (4 GPUs)(H100)
device: h100 device: h100
@@ -73,29 +73,3 @@ steps:
num_devices: 2 num_devices: 2
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
- label: GPQA Eval (GPT-OSS) (H100)
timeout_in_minutes: 120
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt
- label: GPQA Eval (GPT-OSS) (B200)
timeout_in_minutes: 120
device: b200
optional: true
num_devices: 2
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/evals/gpt_oss/
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt


@@ -9,7 +9,6 @@ steps:
- tests/v1 - tests/v1
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor - pytest -v -s v1/executor
@@ -17,7 +16,6 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
# TODO: create another `optional` test group for slow tests
- pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not slow_test' v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
@@ -27,11 +25,6 @@ steps:
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: V1 Others (CPU) - label: V1 Others (CPU)
depends_on: depends_on:
@@ -67,13 +60,12 @@ steps:
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic - python3 offline_inference/basic/chat.py # for basic
- python3 basic/offline_inference/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/classify.py
- python3 basic/offline_inference/classify.py - python3 offline_inference/basic/embed.py
- python3 basic/offline_inference/embed.py - python3 offline_inference/basic/score.py
- python3 basic/offline_inference/score.py
# for multi-modal models # for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
@@ -116,11 +108,9 @@ steps:
timeout_in_minutes: 50 timeout_in_minutes: 50
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/detokenizer
- tests/multimodal - tests/multimodal
- tests/utils_ - tests/utils_
commands: commands:
- pytest -v -s detokenizer
- pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_ - pytest -v -s utils_
@@ -133,7 +123,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/test_pooling_params.py - tests/test_pooling_params.py
- tests/test_ray_env.py
- tests/multimodal - tests/multimodal
- tests/renderers - tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
@@ -147,7 +136,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py - pytest -v -s test_pooling_params.py
- pytest -v -s test_ray_env.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers - pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
@@ -155,6 +143,20 @@ steps:
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
- pytest -v -s config - pytest -v -s config
- label: GPT-OSS Eval (B200)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
device: b200
optional: true
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
device: h100 device: h100


@@ -9,9 +9,9 @@ steps:
- vllm/config/model.py - vllm/config/model.py
- vllm/model_executor - vllm/model_executor
- tests/model_executor - tests/model_executor
- tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor - pytest -v -s model_executor
- pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py


@@ -1,110 +0,0 @@
group: Model Runner V2
depends_on:
- image-build
steps:
- label: Model Runner V2 Core Tests
timeout_in_minutes: 45
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- vllm/v1/core/sched/
- vllm/v1/attention/
- tests/v1/engine/test_llm_engine.py
- tests/v1/e2e/
- tests/v1/entrypoints/llm/test_struct_output_generate.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
- label: Model Runner V2 Examples
timeout_in_minutes: 45
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/core/sched/
- vllm/v1/worker/gpu_worker.py
- examples/offline_inference/
- examples/basic/offline_inference/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pip install tensorizer # for tensorizer test
- python3 basic/offline_inference/chat.py # for basic
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
#- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
#- python3 basic/offline_inference/embed.py # TODO
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Model Runner V2 Distributed (2 GPUs)
timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/basic_correctness/test_basic_correctness.py
- tests/v1/distributed/test_async_llm_dp.py
- tests/v1/distributed/test_eagle_dp.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
# The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/distributed/test_pipeline_parallel.py
#- tests/distributed/test_pp_cudagraph.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
# TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
#- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"
- label: Model Runner V2 Spec Decode
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"


@@ -4,6 +4,7 @@ depends_on:
steps: steps:
- label: Basic Models Tests (Initialization) - label: Basic Models Tests (Initialization)
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,6 +16,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N - label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -36,12 +38,6 @@ steps:
- tests/models/test_registry.py - tests/models/test_registry.py
commands: commands:
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Basic Models Test (Other CPU) # 5min - label: Basic Models Test (Other CPU) # 5min
depends_on: depends_on:
@@ -65,7 +61,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper


@@ -4,6 +4,7 @@ depends_on:
steps: steps:
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,6 +16,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N - label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45 timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/models/ - vllm/model_executor/models/
@@ -30,6 +32,7 @@ steps:
- label: Language Models Tests (Hybrid) %N - label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75 timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -37,7 +40,7 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
# Shard hybrid language model tests # Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
@@ -45,6 +48,7 @@ steps:
- label: Language Models Test (Extended Generation) # 80min - label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -52,21 +56,13 @@ steps:
commands: commands:
# Install fast path packages for testing against transformers # Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM # Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL) - label: Language Models Test (PPL)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -76,20 +72,17 @@ steps:
- label: Language Models Test (Extended Pooling) # 36min - label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50 timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/pooling - tests/models/language/pooling
commands: commands:
- pytest -v -s models/language/pooling -m 'not core_model' - pytest -v -s models/language/pooling -m 'not core_model'
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Language Models Test (MTEB) - label: Language Models Test (MTEB)
timeout_in_minutes: 110 timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/


@@ -2,65 +2,16 @@ group: Models - Multimodal
depends_on: depends_on:
- image-build - image-build
steps: steps:
- label: "Multi-Modal Models (Standard) 1: qwen2" - label: Multi-Modal Models (Standard) # 60min
timeout_in_minutes: 45 timeout_in_minutes: 80
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
- pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: "Multi-Modal Models (Standard) 4: other + whisper"
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor Test (CPU)
depends_on: depends_on:
@@ -69,7 +20,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
- tests/models/registry.py
device: cpu device: cpu
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
@@ -80,7 +30,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
- tests/models/registry.py
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing/test_tensor_schema.py - pytest -v -s models/multimodal/processing/test_tensor_schema.py
@@ -103,11 +52,6 @@ steps:
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2 - label: Multi-Modal Models (Extended) 2
optional: true optional: true
@@ -126,3 +70,12 @@ steps:
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*


@@ -15,17 +15,10 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py - pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y - pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests # end platform plugin tests
# begin io_processor plugins test # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
- pip install -e ./plugins/prithvi_io_processor_plugin - pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py - pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y - pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin
- pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
- pip uninstall bge_m3_sparse_plugin -y
# end io_processor plugins test # end io_processor plugins test
# begin stat_logger plugins test # begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger - pip install -e ./plugins/vllm_add_dummy_stat_logger
@@ -36,6 +29,6 @@ steps:
- pytest -v -s plugins_tests/test_scheduler_plugins.py - pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins


@@ -1,16 +0,0 @@
group: Ray Compatibility
depends_on:
- image-build
steps:
- label: Ray Dependency Compatibility Check
# Informational only — does not block the pipeline.
# If this fails, it means the PR introduces a dependency that
# conflicts with Ray's dependency constraints.
# See https://github.com/vllm-project/vllm/issues/33599
soft_fail: true
timeout_in_minutes: 10
source_file_dependencies:
- requirements/
- setup.py
commands:
- bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh


@@ -12,10 +12,3 @@ steps:
commands: commands:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s samplers


@@ -1,40 +0,0 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"


@@ -13,13 +13,13 @@ steps:
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
# - label: Weight Loading Multiple GPU - Large Models # optional - label: Weight Loading Multiple GPU - Large Models # optional
# working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
# num_devices: 2 num_devices: 2
# device: a100 device: a100
# optional: true optional: true
# source_file_dependencies: source_file_dependencies:
# - vllm/ - vllm/
# - tests/weight_loading - tests/weight_loading
# commands: commands:
# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

.github/.bc-linter.yml

@@ -0,0 +1,24 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attetion/*.py"
# - "vllm/v1/core/*.py"
exclude:
- "**/*.py"
scan:
functions: true # check free functions and methods
classes: true # check classes/dataclasses
public_only: true # ignore names starting with "_" at any level
annotations:
include: # decorators that forceinclude a symbol
- name: "bc_linter_include" # matched by simple name or dotted suffix
propagate_to_members: false # for classes, include methods/inner classes
exclude: # decorators that forceexclude a symbol
- name: "bc_linter_skip" # matched by simple name or dotted suffix
propagate_to_members: true # for classes, exclude methods/inner classes
excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
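The `annotations` block above matches decorators purely by name ("simple name or dotted suffix"), so the repository only needs importable markers called `bc_linter_include` and `bc_linter_skip`; they can be runtime no-ops. A minimal sketch under that assumption (names and placement are illustrative, not taken from the vLLM tree):

```python
# Hypothetical no-op marker decorators for the BC linter. The linter matches
# them by name, so returning the object unchanged is enough.
from typing import TypeVar

T = TypeVar("T")


def bc_linter_include(obj: T) -> T:
    # Force the decorated symbol into BC-lint coverage despite the global
    # `exclude: "**/*.py"` rule in the config above.
    return obj


def bc_linter_skip(obj: T) -> T:
    # Exclude the decorated symbol; with propagate_to_members: true, class
    # methods and inner classes are excluded as well.
    return obj


@bc_linter_include
def stable_public_api(request_id: str, num_tokens: int) -> int:
    # Backward-incompatible signature changes here would now be reported.
    return num_tokens
```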

.github/CODEOWNERS

@@ -2,66 +2,45 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery /vllm/model_executor/layers/attention @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn /vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact, # Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people # so spam a lot of people
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
/vllm/config/cache.py @heheda12345 /vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# Entrypoints
/vllm/entrypoints/anthropic @mgoin @DarkLight1337
/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
/vllm/entrypoints/mcp @heheda12345
/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
/vllm/entrypoints/openai/realtime @njhill
/vllm/entrypoints/openai/speech_to_text @NickLucche
/vllm/entrypoints/pooling @noooop
/vllm/entrypoints/sagemaker @DarkLight1337
/vllm/entrypoints/serve @njhill
/vllm/entrypoints/*.py @njhill
/vllm/entrypoints/chat_utils.py @DarkLight1337
/vllm/entrypoints/llm.py @DarkLight1337
# Input/Output Processing
/vllm/sampling_params.py @njhill @NickLucche
/vllm/pooling_params.py @noooop @DarkLight1337
/vllm/tokenizers @DarkLight1337 @njhill
/vllm/renderers @DarkLight1337 @njhill
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
# vLLM V1 # vLLM V1
/vllm/v1/attention @LucasWilkinson @MatthewBonanni /vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
/vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni /vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery /vllm/v1/kv_offload @ApostaC @orozery
/vllm/v1/engine @njhill /vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/executor @njhill /vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
/vllm/v1/worker @njhill
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche
# Model runner V2 # Model runner V2
/vllm/v1/worker/gpu @WoosukKwon @njhill /vllm/v1/worker/gpu @WoosukKwon
/vllm/v1/worker/gpu/kv_connector.py @orozery
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin /.buildkite/lm-eval-harness @mgoin
@@ -136,8 +115,8 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/mixtral*.py @patrickvonplaten /vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten /vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/tokenizers/mistral.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels # Kernels
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
@@ -173,7 +152,9 @@ mkdocs.yaml @hmellor
/examples/pooling @noooop /examples/pooling @noooop
/tests/models/*/pooling* @noooop /tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop /tests/entrypoints/pooling @noooop
/vllm/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop /vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler @noooop /vllm/model_executor/layers/pooler @noooop
# Security guide and policies # Security guide and policies

.github/mergify.yml

@@ -3,7 +3,6 @@ pull_request_rules:
description: Automatically apply documentation label description: Automatically apply documentation label
conditions: conditions:
- label != stale - label != stale
- -closed
- or: - or:
- files~=^[^/]+\.md$ - files~=^[^/]+\.md$
- files~=^docs/ - files~=^docs/
@@ -27,7 +26,7 @@ pull_request_rules:
Hi @{{author}}, the pre-commit checks have failed. Please run: Hi @{{author}}, the pre-commit checks have failed. Please run:
```bash ```bash
uv pip install pre-commit>=4.5.1 uv pip install pre-commit
pre-commit install pre-commit install
pre-commit run --all-files pre-commit run --all-files
``` ```
@@ -38,13 +37,15 @@ pull_request_rules:
> [!TIP] > [!TIP]
> <details> > <details>
> <summary>Is <code>mypy</code> failing?</summary> > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <br/> > <br/>
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally: > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
> >
> ```bash > ```bash
> # For mypy (substitute "3.10" with the failing version if needed) > # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10 > pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ``` > ```
> </details> > </details>
@@ -258,7 +259,8 @@ pull_request_rules:
- files=benchmarks/run_structured_output_benchmark.sh - files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md - files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py - files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/structured_outputs/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/ - files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/ - files~=^vllm/v1/structured_output/
@@ -334,7 +336,7 @@ pull_request_rules:
- or: - or:
- files~=^tests/tool_use/ - files~=^tests/tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md - files=docs/features/tool_calling.md
- files~=^examples/tool_chat_* - files~=^examples/tool_chat_*
@@ -381,7 +383,7 @@ pull_request_rules:
- or: - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/model_executor/model_loader/tensorizer_loader/ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions: actions:
assign: assign:

.github/workflows/bc-lint.yml

@@ -0,0 +1,29 @@
name: BC Lint
on:
pull_request:
types:
- opened
- synchronize
- reopened
- labeled
- unlabeled
jobs:
bc_lint:
if: github.repository_owner == 'vllm-project'
runs-on: ubuntu-latest
steps:
- name: Run BC Lint Action
uses: pytorch/test-infra/.github/actions/bc-lint@main
with:
repo: ${{ github.event.pull_request.head.repo.full_name }}
base_sha: ${{ github.event.pull_request.base.sha }}
head_sha: ${{ github.event.pull_request.head.sha }}
suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir: .github
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true


@@ -19,7 +19,6 @@ jobs:
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with: with:
python-version: '3.12' python-version: '3.12'
cache: 'pip'
- name: Install Python dependencies - name: Install Python dependencies
run: | run: |


@@ -6,9 +6,6 @@ on:
- main - main
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
permissions:
contents: read
jobs: jobs:
macos-m1-smoke-test: macos-m1-smoke-test:
runs-on: macos-latest runs-on: macos-latest

.gitignore

@@ -3,8 +3,6 @@
# vllm-flash-attn built from source # vllm-flash-attn built from source
vllm/vllm_flash_attn/* vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/__init__.py
!vllm/vllm_flash_attn/flash_attn_interface.py
# OpenAI triton kernels copied from source # OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/* vllm/third_party/triton_kernels/*
@@ -189,9 +187,11 @@ cython_debug/
.vscode/ .vscode/
# Claude # Claude
CLAUDE.md
.claude/ .claude/
# Codex # Codex
AGENTS.md
.codex/ .codex/
# Cursor # Cursor
@@ -238,6 +238,3 @@ ep_kernels_workspace/
vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi vllm/grpc/vllm_engine_pb2.pyi
# Ignore generated cpu headers
csrc/cpu/cpu_attn_dispatch_generated.h


@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.43.5 rev: v1.38.1
hooks: hooks:
- id: typos - id: typos
args: [--force-exclude] args: [--force-exclude]
@@ -24,13 +24,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda] types_or: [c++, cuda]
args: [--style=file, --verbose] args: [--style=file, --verbose]
- repo: https://github.com/DavidAnson/markdownlint-cli2 - repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.21.0 rev: v0.45.0
hooks: hooks:
- id: markdownlint-cli2 - id: markdownlint
language_version: lts exclude: '.*\.inc\.md'
args: [--fix] stages: [manual] # Only run in CI
exclude: ^CLAUDE\.md$
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
hooks: hooks:
@@ -56,7 +55,7 @@ repos:
language: python language: python
types_or: [python, pyi] types_or: [python, pyi]
require_serial: true require_serial: true
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10 name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10" entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -128,13 +127,6 @@ repos:
language: python language: python
types: [python] types: [python]
additional_dependencies: [regex] additional_dependencies: [regex]
# prevent use torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' APIs call"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config - id: validate-config
name: Validate configuration has default values and that each field has a docstring name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py entry: python tools/pre_commit/validate_config.py
@@ -151,11 +143,6 @@ repos:
name: Check attention backend documentation is up to date name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python language: python
- id: check-boolean-context-manager
name: Check for boolean ops in with-statements
entry: python tools/pre_commit/check_boolean_context_manager.py
language: python
types: [python]
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion


@@ -9,15 +9,13 @@ build:
python: "3.12" python: "3.12"
jobs: jobs:
post_checkout: post_checkout:
# - bash docs/maybe_skip_pr_build.sh - git fetch --unshallow || true
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv
create_environment:
- uv venv $READTHEDOCS_VIRTUALENV_PATH
install:
- uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt
mkdocs: mkdocs:
configuration: mkdocs.yaml configuration: mkdocs.yaml
fail_on_warning: true fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: requirements/docs.txt

AGENTS.md

@@ -1,113 +0,0 @@
# Agent Instructions for vLLM
> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`.
> Breaching these guidelines can result in automatic banning.
## 1. Contribution Policy (Mandatory)
### Duplicate-work checks
Before proposing a PR, run these checks:
```bash
gh issue view <issue_number> --repo vllm-project/vllm --comments
gh pr list --repo vllm-project/vllm --state open --search "<issue_number> in:body"
gh pr list --repo vllm-project/vllm --state open --search "<short area keywords>"
```
- If an open PR already addresses the same fix, do not open another.
- If your approach is materially different, explain the difference in the issue.
### No low-value busywork PRs
Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work.
### Accountability
- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end.
- The submitting human must review every changed line and run relevant tests.
- PR descriptions for AI-assisted work **must** include:
- Why this is not duplicating an existing PR.
- Test commands run and results.
- Clear statement that AI assistance was used.
### Fail-closed behavior
If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing.
---
## 2. Development Workflow
### Environment setup
```bash
# Install `uv` if you don't have it already:
curl -LsSf https://astral.sh/uv/install.sh | sh
# Always use `uv` for Python environment management:
uv venv --python 3.12
source .venv/bin/activate
# Always make sure `pre-commit` and its hooks are installed:
uv pip install -r requirements/lint.txt
pre-commit install
```
### Installing dependencies
```bash
# If you are only making Python changes:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
# If you are also making C/C++ changes:
uv pip install -e .
```
### Running tests
Tests require extra dependencies.
All versions for test dependencies should be read from `requirements/test.txt`
```bash
# Install bare minimum test dependencies:
uv pip install pytest pytest-asyncio tblib
# Install additional test dependencies as needed, or install them all as follows:
uv pip install -r requirements/test.txt
# Run specific test from specific test file
pytest tests/path/to/test.py -v -s -k test_name
# Run all tests in directory
pytest tests/path/to/dir -v -s
```
### Running linters
```bash
# Run all pre-commit hooks on staged files:
pre-commit run
# Run on all files:
pre-commit run --all-files
# Run a specific hook:
pre-commit run ruff-check --all-files
# Run mypy as it is in CI:
pre-commit run mypy-3.10 --all-files --hook-stage manual
```
### Commit messages
Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example:
```text
Your commit message here
Co-authored-by: GitHub Copilot
Co-authored-by: Claude
Co-authored-by: gemini-code-assist
Signed-off-by: Your Name <your.email@example.com>
```


@@ -1 +0,0 @@
@AGENTS.md


@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# ROCm installation prefix. Default to /opt/rocm but allow override via # ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake. # -DROCM_PATH=/your/rocm/path when invoking cmake.
@@ -293,7 +293,6 @@ set(VLLM_EXT_SRC
"csrc/fused_qknorm_rope_kernel.cu" "csrc/fused_qknorm_rope_kernel.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu" "csrc/sampler.cu"
"csrc/topk.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/w8a8/int8/scaled_quant.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu"
@@ -725,7 +724,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels # CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output. # if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
@@ -771,51 +770,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS)
set(SRCS
"csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu"
"csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1")
message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8
AND ES_MXFP8_GROUPED_MM_ARCHS)
message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is "
"not >= 12.8.")
else()
message(STATUS "Not building ES MXFP8 grouped kernels as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()
# DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_FUSED_A_GEMM_SRC}"
CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
else()
message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
"in CUDA target architectures.")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels. # moe_data.cu is used by all CUTLASS MoE kernels.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
@@ -998,8 +952,7 @@ set(VLLM_MOE_EXT_SRC
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/moe_wna16.cu" "csrc/moe/moe_wna16.cu"
"csrc/moe/grouped_topk_kernels.cu" "csrc/moe/grouped_topk_kernels.cu")
"csrc/moe/router_gemm.cu")
endif() endif()
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -1128,27 +1081,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures") " in CUDA target architectures")
endif() endif()
# DeepSeek V3 router GEMM kernel - requires SM90+
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS)
set(DSV3_ROUTER_GEMM_SRC
"csrc/moe/dsv3_router_gemm_entry.cu"
"csrc/moe/dsv3_router_gemm_float_out.cu"
"csrc/moe/dsv3_router_gemm_bf16_out.cu")
set_gencode_flags_for_srcs(
SRCS "${DSV3_ROUTER_GEMM_SRC}"
CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}")
message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}")
else()
message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found"
" (requires SM90+ and CUDA >= 12.0)")
endif()
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")


@@ -187,7 +187,7 @@ python benchmark.py \
## Hardware Requirements ## Hardware Requirements
| Backend | Hardware | | Backend | Hardware |
| ------- | -------- | |---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU | | Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) | | CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) | | FlashAttn MLA | Hopper (SM90+) |


@@ -15,6 +15,7 @@ from .common import (
BenchmarkConfig, BenchmarkConfig,
BenchmarkResult, BenchmarkResult,
MockLayer, MockLayer,
MockModelConfig,
ResultsFormatter, ResultsFormatter,
get_attention_scale, get_attention_scale,
is_mla_backend, is_mla_backend,
@@ -35,6 +36,7 @@ __all__ = [
"ResultsFormatter", "ResultsFormatter",
# Mock objects # Mock objects
"MockLayer", "MockLayer",
"MockModelConfig",
# Utilities # Utilities
"setup_mla_dims", "setup_mla_dims",
"get_attention_scale", "get_attention_scale",


@@ -229,40 +229,3 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict:
sum(r.kv_len for r in requests) / len(requests) if requests else 0 sum(r.kv_len for r in requests) / len(requests) if requests else 0
), ),
} }
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"


@@ -43,12 +43,9 @@ from common import (
ModelParameterSweep, ModelParameterSweep,
ParameterSweep, ParameterSweep,
ResultsFormatter, ResultsFormatter,
batch_spec_sort_key,
is_mla_backend, is_mla_backend,
) )
from vllm.v1.worker.workspace import init_workspace_manager
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer).""" """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -61,9 +58,7 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend.""" """Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla from mla_runner import run_mla_benchmark as run_mla
return run_mla( return run_mla(config.backend, config, **kwargs)
config.backend, config, prefill_backend=config.prefill_backend, **kwargs
)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
@@ -223,13 +218,10 @@ def run_model_parameter_sweep(
by_param_and_spec[key].append(r) by_param_and_spec[key].append(r)
break break
# Sort by param value then spec (batch_size, q_len, kv_len) # Sort by param value then spec
sorted_keys = sorted( sorted_keys = sorted(
by_param_and_spec.keys(), by_param_and_spec.keys(),
key=lambda x: ( key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
) )
current_param_value = None current_param_value = None
@@ -338,7 +330,7 @@ def run_parameter_sweep(
by_spec[spec] = [] by_spec[spec] = []
by_spec[spec].append(r) by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key): for spec in sorted(by_spec.keys()):
results = by_spec[spec] results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time) best = min(results, key=lambda r: r.mean_time)
console.print( console.print(
@@ -444,27 +436,20 @@ def main():
# Backend selection # Backend selection
parser.add_argument( parser.add_argument(
"--backends", "--backends",
"--decode-backends",
nargs="+", nargs="+",
help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, " help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)", "flashinfer_mla, flashattn_mla, flashmla)",
) )
parser.add_argument( parser.add_argument(
"--backend", "--backend",
help="Single backend (alternative to --backends)", help="Single backend (alternative to --backends)",
) )
parser.add_argument(
"--prefill-backends",
nargs="+",
help="Prefill backends to compare (fa2, fa3, fa4). "
"Uses the first decode backend for impl construction.",
)
# Batch specifications # Batch specifications
parser.add_argument( parser.add_argument(
"--batch-specs", "--batch-specs",
nargs="+", nargs="+",
default=None, default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar", help="Batch specifications using extended grammar",
) )
@@ -480,21 +465,6 @@ def main():
parser.add_argument("--repeats", type=int, default=1, help="Repetitions") parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations") parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory") parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
parser.add_argument(
"--kv-cache-dtype",
default="auto",
choices=["auto", "fp8"],
help="KV cache dtype: auto or fp8",
)
parser.add_argument(
"--cuda-graphs",
action=argparse.BooleanOptionalAction,
default=True,
help=(
"Launch kernels with CUDA graphs to eliminate CPU overhead"
"in measurements (default: True)"
),
)
# Parameter sweep (use YAML config for advanced sweeps) # Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument( parser.add_argument(
@@ -526,24 +496,15 @@ def main():
if "description" in yaml_config: if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]") console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence # Override args with YAML values
# Check if CLI provided backends (they would be non-None and not default) # (YAML takes precedence unless CLI arg was explicitly set)
cli_backends_provided = args.backend is not None or args.backends is not None # Backend(s)
if "backend" in yaml_config:
# Backend(s) - only use YAML if CLI didn't specify args.backend = yaml_config["backend"]
if not cli_backends_provided: args.backends = None
if "backend" in yaml_config: elif "backends" in yaml_config:
args.backend = yaml_config["backend"] args.backends = yaml_config["backends"]
args.backends = None args.backend = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
elif "decode_backends" in yaml_config:
args.backends = yaml_config["decode_backends"]
args.backend = None
# Prefill backends (e.g., ["fa3", "fa4"])
args.prefill_backends = yaml_config.get("prefill_backends", None)
# Check for special modes # Check for special modes
if "mode" in yaml_config: if "mode" in yaml_config:
@@ -553,24 +514,21 @@ def main():
# Batch specs and sizes # Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges # Support both explicit batch_specs and generated batch_spec_ranges
# CLI --batch-specs takes precedence over YAML when provided. if "batch_spec_ranges" in yaml_config:
cli_batch_specs_provided = args.batch_specs is not None # Generate batch specs from ranges
if not cli_batch_specs_provided: generated_specs = generate_batch_specs_from_ranges(
if "batch_spec_ranges" in yaml_config: yaml_config["batch_spec_ranges"]
# Generate batch specs from ranges )
generated_specs = generate_batch_specs_from_ranges( # Combine with any explicit batch_specs
yaml_config["batch_spec_ranges"] if "batch_specs" in yaml_config:
) args.batch_specs = yaml_config["batch_specs"] + generated_specs
# Combine with any explicit batch_specs else:
if "batch_specs" in yaml_config: args.batch_specs = generated_specs
args.batch_specs = yaml_config["batch_specs"] + generated_specs console.print(
else: f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
args.batch_specs = generated_specs )
console.print( elif "batch_specs" in yaml_config:
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]" args.batch_specs = yaml_config["batch_specs"]
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config: if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"] args.batch_sizes = yaml_config["batch_sizes"]
@@ -586,19 +544,13 @@ def main():
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size) args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys) # Benchmark settings
if "device" in yaml_config: if "benchmark" in yaml_config:
args.device = yaml_config["device"] bench = yaml_config["benchmark"]
if "repeats" in yaml_config: args.device = bench.get("device", args.device)
args.repeats = yaml_config["repeats"] args.repeats = bench.get("repeats", args.repeats)
if "warmup_iters" in yaml_config: args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.warmup_iters = yaml_config["warmup_iters"] args.profile_memory = bench.get("profile_memory", args.profile_memory)
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
if "kv_cache_dtype" in yaml_config:
args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
if "cuda_graphs" in yaml_config:
args.cuda_graphs = yaml_config["cuda_graphs"]
# Parameter sweep configuration # Parameter sweep configuration
if "parameter_sweep" in yaml_config: if "parameter_sweep" in yaml_config:
@@ -652,19 +604,10 @@ def main():
# Determine backends # Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"]) backends = args.backends or ([args.backend] if args.backend else ["flash"])
prefill_backends = getattr(args, "prefill_backends", None)
if not args.batch_specs:
args.batch_specs = ["q2k", "8q1s1k"]
console.print(f"Backends: {', '.join(backends)}") console.print(f"Backends: {', '.join(backends)}")
if prefill_backends:
console.print(f"Prefill backends: {', '.join(prefill_backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}") console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print(f"KV cache dtype: {args.kv_cache_dtype}")
console.print(f"CUDA graphs: {args.cuda_graphs}")
console.print() console.print()
init_workspace_manager(args.device)
# Run benchmarks # Run benchmarks
all_results = [] all_results = []
@@ -717,8 +660,6 @@ def main():
repeats=args.repeats, repeats=args.repeats,
warmup_iters=args.warmup_iters, warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory, profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
) )
# Add decode pipeline config # Add decode pipeline config
@@ -871,8 +812,6 @@ def main():
"repeats": args.repeats, "repeats": args.repeats,
"warmup_iters": args.warmup_iters, "warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory, "profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
} }
all_results = run_model_parameter_sweep( all_results = run_model_parameter_sweep(
backends, backends,
@@ -895,8 +834,6 @@ def main():
"repeats": args.repeats, "repeats": args.repeats,
"warmup_iters": args.warmup_iters, "warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory, "profile_memory": args.profile_memory,
"kv_cache_dtype": args.kv_cache_dtype,
"use_cuda_graphs": args.cuda_graphs,
} }
all_results = run_parameter_sweep( all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -904,95 +841,37 @@ def main():
else: else:
# Normal mode: compare backends # Normal mode: compare backends
decode_results = [] total = len(backends) * len(args.batch_specs)
prefill_results = []
# Run decode backend comparison with tqdm(total=total, desc="Benchmarking") as pbar:
if not prefill_backends: for spec in args.batch_specs:
# No prefill backends specified: compare decode backends as before for backend in backends:
total = len(backends) * len(args.batch_specs) config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
with tqdm(total=total, desc="Benchmarking") as pbar: result = run_benchmark(config)
for spec in args.batch_specs: all_results.append(result)
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
kv_cache_dtype=args.kv_cache_dtype,
use_cuda_graphs=args.cuda_graphs,
)
result = run_benchmark(config) if not result.success:
decode_results.append(result) console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
if not result.success: pbar.update(1)
console.print(
f"[red]Error {backend} {spec}: {result.error}[/]"
)
pbar.update(1) # Display results
console.print("\n[bold green]Results:[/]")
console.print("\n[bold green]Results:[/]") formatter = ResultsFormatter(console)
formatter = ResultsFormatter(console) formatter.print_table(all_results, backends)
formatter.print_table(decode_results, backends)
# Run prefill backend comparison
if prefill_backends:
# Use first decode backend for impl construction
decode_backend = backends[0]
total = len(prefill_backends) * len(args.batch_specs)
console.print(
f"[yellow]Prefill comparison mode: "
f"using {decode_backend} for decode impl[/]"
)
with tqdm(total=total, desc="Prefill benchmarking") as pbar:
for spec in args.batch_specs:
for pb in prefill_backends:
config = BenchmarkConfig(
backend=decode_backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
prefill_backend=pb,
)
result = run_benchmark(config)
# Label result with prefill backend name for display
labeled_config = replace(result.config, backend=pb)
result = replace(result, config=labeled_config)
prefill_results.append(result)
if not result.success:
console.print(f"[red]Error {pb} {spec}: {result.error}[/]")
pbar.update(1)
console.print("\n[bold green]Prefill Backend Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(
prefill_results, prefill_backends, compare_to_fastest=True
)
all_results = decode_results + prefill_results
# Save results # Save results
if all_results: if all_results:


@@ -10,37 +10,18 @@ from dataclasses import asdict, dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
import numpy as np
import torch import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console from rich.console import Console
from rich.table import Table from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparsable specs
return (0, 0, 0)
# Mock classes for vLLM attention infrastructure # Mock classes for vLLM attention infrastructure
class MockHfConfig: class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements.""" """Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None): def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"] self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"] self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
@@ -51,8 +32,6 @@ class MockHfConfig:
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"] self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self): def get_text_config(self):
return self return self
@@ -61,7 +40,10 @@ class MockHfConfig:
# Import AttentionLayerBase at module level to avoid circular dependencies # Import AttentionLayerBase at module level to avoid circular dependencies
try: try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError: except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback AttentionLayerBase = object # Fallback
@@ -77,7 +59,6 @@ class MockKVBProj:
self.qk_nope_head_dim = qk_nope_head_dim self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim self.out_dim = qk_nope_head_dim + v_head_dim
self.weight = torch.empty(0, dtype=torch.bfloat16)
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]: def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
""" """
@@ -101,38 +82,6 @@ class MockKVBProj:
return (result,) # Return as tuple to match ColumnParallelLinear API return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
class MockLayer(AttentionLayerBase): class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl. """Mock attention layer with scale parameters and impl.
@@ -164,6 +113,95 @@ class MockLayer(AttentionLayerBase):
return self._kv_cache_spec return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass @dataclass
class ParameterSweep: class ParameterSweep:
"""Configuration for sweeping a backend parameter.""" """Configuration for sweeping a backend parameter."""
@@ -213,11 +251,7 @@ class BenchmarkConfig:
profile_memory: bool = False profile_memory: bool = False
use_cuda_graphs: bool = False use_cuda_graphs: bool = False
# "auto" or "fp8"
kv_cache_dtype: str = "auto"
# MLA-specific # MLA-specific
prefill_backend: str | None = None
kv_lora_rank: int | None = None kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None qk_rope_head_dim: int | None = None
@@ -282,19 +316,14 @@ class ResultsFormatter:
backends: List of backend names being compared backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest compare_to_fastest: Show percentage comparison to fastest
""" """
# Group by batch spec, preserving first-occurrence order # Group by batch spec
by_spec = {} by_spec = {}
specs_order = []
for r in results: for r in results:
spec = r.config.batch_spec spec = r.config.batch_spec
if spec not in by_spec: if spec not in by_spec:
by_spec[spec] = {} by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display # Create shortened backend names for display
def shorten_backend_name(name: str) -> str: def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display.""" """Shorten long backend names for table display."""
@@ -308,8 +337,6 @@ class ResultsFormatter:
table = Table(title="Attention Benchmark Results") table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True) table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1 multi = len(backends) > 1
for backend in backends: for backend in backends:
@@ -323,14 +350,12 @@ class ResultsFormatter:
table.add_column(col_rel, justify="right", no_wrap=False) table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows # Add rows
for spec in specs_order: for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec] spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success} times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0 best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec) row = [spec]
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends: for backend in backends:
if backend in spec_results: if backend in spec_results:
r = spec_results[backend] r = spec_results[backend]
@@ -372,7 +397,6 @@ class ResultsFormatter:
"backend", "backend",
"batch_spec", "batch_spec",
"num_layers", "num_layers",
"kv_cache_dtype",
"mean_time", "mean_time",
"std_time", "std_time",
"throughput", "throughput",
@@ -386,7 +410,6 @@ class ResultsFormatter:
"backend": r.config.backend, "backend": r.config.backend,
"batch_spec": r.config.batch_spec, "batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers, "num_layers": r.config.num_layers,
"kv_cache_dtype": r.config.kv_cache_dtype,
"mean_time": r.mean_time, "mean_time": r.mean_time,
"std_time": r.std_time, "std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0, "throughput": r.throughput_tokens_per_sec or 0,
@@ -463,11 +486,10 @@ def get_attention_scale(head_dim: int) -> float:
def is_mla_backend(backend: str) -> bool: def is_mla_backend(backend: str) -> bool:
""" """
Check if backend is an MLA backend using the AttentionBackendEnum. Check if backend is an MLA backend using the backend's is_mla() property.
Args: Args:
backend: Backend name matching AttentionBackendEnum exactly backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
(e.g., "FLASHMLA_SPARSE")
Returns: Returns:
True if the backend is an MLA backend, False otherwise True if the backend is an MLA backend, False otherwise
@@ -475,8 +497,7 @@ def is_mla_backend(backend: str) -> bool:
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
try: try:
backend_enum = AttentionBackendEnum[backend] backend_class = AttentionBackendEnum[backend.upper()].get_class()
backend_class = backend_enum.get_class()
return backend_class.is_mla() return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError): except (KeyError, ValueError, ImportError):
return False return False


@@ -3,7 +3,7 @@
model: model:
name: "deepseek-v3" name: "deepseek-v3"
num_layers: 60 num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576 head_dim: 576
kv_lora_rank: 512 kv_lora_rank: 512
@@ -12,13 +12,6 @@ model:
v_head_dim: 128 v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs: batch_specs:
# Small batches, varying sequence lengths # Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache - "16q1s512" # 16 requests, 512 KV cache
@@ -41,30 +34,28 @@ batch_specs:
# Very large batches # Very large batches
- "128q1s1k" # 128 requests, 1k KV cache - "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache - "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context # Long context
- "32q1s16k" # 32 requests, 16k KV cache - "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache - "32q1s32k" # 32 requests, 32k KV cache
backends: backends:
- CUTLASS_MLA - cutlass_mla
- FLASHINFER_MLA - flashinfer_mla
- FLASH_ATTN_MLA # Hopper only - flashattn_mla # Hopper only
- FLASHMLA # Hopper only - flashmla # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 100 repeats: 5
warmup_iters: 10 warmup_iters: 3
profile_memory: true profile_memory: true
# Backend-specific tuning # Backend-specific tuning
CUTLASS_MLA: cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16 num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA: flashattn_mla:
reorder_batch_threshold: 512 reorder_batch_threshold: 512
FLASHMLA: flashmla:
reorder_batch_threshold: 1 reorder_batch_threshold: 1
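For readers unfamiliar with the spec strings above: each underscore-separated group reads <count>q<q_len>[s<seq_len>], with a "k" suffix meaning x1024, so "16q1s512" is 16 decode requests with query length 1 against a 512-token KV cache, and "2q4k_32q1s1k" mixes 2 prefills of 4k tokens with 32 decodes. A rough, illustrative parser (not the benchmark's actual parse_batch_spec; it assumes pure-prefill groups default the sequence length to the query length):

import re

def _size(tok: str) -> int:
    # "512" -> 512, "4k" -> 4096
    return int(tok[:-1]) * 1024 if tok.endswith("k") else int(tok)

def parse_spec(spec: str) -> list[tuple[int, int, int]]:
    """Return (num_requests, q_len, seq_len) for each group in the spec."""
    groups = []
    for group in spec.split("_"):
        m = re.fullmatch(r"(\d*)q(\d+k?)(?:s(\d+k?))?", group)
        if m is None:
            raise ValueError(f"unrecognized group: {group!r}")
        count = int(m.group(1)) if m.group(1) else 1
        q_len = _size(m.group(2))
        seq_len = _size(m.group(3)) if m.group(3) else q_len
        groups.append((count, q_len, seq_len))
    return groups

print(parse_spec("16q1s512"))      # [(16, 1, 512)]
print(parse_spec("2q4k_32q1s1k"))  # [(2, 4096, 4096), (32, 1, 1024)]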

View File

@@ -30,9 +30,9 @@ batch_specs:
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode - "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode # Context extension + decode
- "2q1ks2k_16q1s1k" # 2 extend + 16 decode - "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2ks4k_32q1s2k" # 4 extend + 32 decode - "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1ks8k_32q1s2k" # 2 large extend + 32 decode - "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill # Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint - "q8k" # 8k prefill with chunking hint
@@ -45,10 +45,10 @@ batch_specs:
- "4q4k_60q1s4k" # 4 prefill + 60 decode - "4q4k_60q1s4k" # 4 prefill + 60 decode
backends: backends:
- CUTLASS_MLA - cutlass_mla
- FLASHINFER_MLA - flashinfer_mla
- FLASH_ATTN_MLA # Hopper only - flashattn_mla # Hopper only
- FLASHMLA # Hopper only - flashmla # Hopper only
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -1,126 +0,0 @@
# MLA prefill backend comparison
#
# Compares all available MLA prefill backends:
# FA backends: fa2, fa3, fa4 (FlashAttention versions)
# Non-FA: flashinfer, cudnn, trtllm (Blackwell-only, require flashinfer)
#
# Uses cutlass_mla as the decode backend for impl construction
# (only the prefill path is exercised).
#
# Backends that aren't available on the current platform will report errors
# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory).
#
# Usage:
# python benchmark.py --config configs/mla_prefill.yaml
description: "MLA prefill backend comparison"
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# model:
# name: "deepseek-v2-lite"
# num_layers: 27
# num_q_heads: 16
# num_kv_heads: 1
# head_dim: 576
# kv_lora_rank: 512
# qk_nope_head_dim: 128
# qk_rope_head_dim: 64
# v_head_dim: 128
# block_size: 128
batch_specs:
# Pure prefill
- "q512"
- "q1k"
- "q2k"
- "q4k"
- "q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Chunked prefill / extend
# Short context
- "q128s1k"
- "q256s2k"
- "q512s4k"
- "q1ks4k"
- "q2ks8k"
- "2q128s1k"
- "2q256s2k"
- "2q512s4k"
- "2q1ks4k"
- "2q2ks8k"
- "4q128s1k"
- "4q256s2k"
- "4q512s4k"
- "4q1ks4k"
- "4q2ks8k"
- "8q128s1k"
- "8q256s2k"
- "8q512s4k"
- "8q1ks4k"
# Medium context
- "q128s16k"
- "q512s16k"
- "q1ks16k"
- "q2ks16k"
- "2q128s16k"
- "2q512s16k"
- "2q1ks16k"
- "2q2ks16k"
- "4q128s16k"
- "4q512s16k"
- "4q1ks16k"
- "4q2ks16k"
# Long context
- "q128s64k"
- "q512s64k"
- "q1ks64k"
- "q2ks64k"
- "2q128s64k"
- "2q512s64k"
- "2q1ks64k"
- "2q2ks64k"
decode_backends:
- CUTLASS_MLA
prefill_backends:
- fa2
- fa3
- fa4
- flashinfer
- cudnn
- trtllm
device: "cuda:0"
repeats: 20
warmup_iters: 5

View File

@@ -1,58 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true

View File

@@ -1,62 +0,0 @@
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true

View File

@@ -6,7 +6,7 @@
description: "Decode vs Prefill pipeline crossover analysis" description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA # Test FlashAttn MLA
backend: FLASH_ATTN_MLA backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode) # Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines # For each batch spec, we'll test both decode and prefill pipelines
@@ -62,10 +62,11 @@ model:
block_size: 128 block_size: 128
# Benchmark settings # Benchmark settings
device: "cuda:0" benchmark:
repeats: 15 # More repeats for spec decode variance device: "cuda:0"
warmup_iters: 5 repeats: 15 # More repeats for spec decode variance
profile_memory: false warmup_iters: 5
profile_memory: false
# Output # Output
output: output:

View File

@@ -41,17 +41,18 @@ batch_specs:
# Backends that support query length > 1 # Backends that support query length > 1
backends: backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512 - flashattn_mla # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable) - flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism
# - FLASHINFER_MLA # - flashinfer_mla
# Benchmark settings # Benchmark settings
device: "cuda:0" benchmark:
repeats: 10 # More repeats for statistical significance device: "cuda:0"
warmup_iters: 5 repeats: 10 # More repeats for statistical significance
profile_memory: false warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization # Test these threshold values for optimization
parameter_sweep: parameter_sweep:

View File

@@ -25,22 +25,14 @@ batch_specs:
- "4q1k_16q1s2k" # 4 prefill + 16 decode - "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode - "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8) # Context extension
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache - "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence - "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends: backends:
- FLASH_ATTN - flash
- TRITON_ATTN - triton
- FLASHINFER - flashinfer
device: "cuda:0" device: "cuda:0"
repeats: 5 repeats: 5

View File

@@ -8,13 +8,14 @@ This module provides helpers for running MLA backends without
needing full VllmConfig integration. needing full VllmConfig integration.
""" """
import importlib
import numpy as np import numpy as np
import torch import torch
from batch_spec import parse_batch_spec from batch_spec import parse_batch_spec
from common import ( from common import (
BenchmarkResult, BenchmarkResult,
MockHfConfig, MockHfConfig,
MockIndexer,
MockKVBProj, MockKVBProj,
MockLayer, MockLayer,
setup_mla_dims, setup_mla_dims,
@@ -60,11 +61,7 @@ def create_minimal_vllm_config(
model_name: str = "deepseek-v3", model_name: str = "deepseek-v3",
block_size: int = 128, block_size: int = 128,
max_num_seqs: int = 256, max_num_seqs: int = 256,
max_num_batched_tokens: int = 8192,
mla_dims: dict | None = None, mla_dims: dict | None = None,
index_topk: int | None = None,
prefill_backend: str | None = None,
kv_cache_dtype: str = "auto",
) -> VllmConfig: ) -> VllmConfig:
""" """
Create minimal VllmConfig for MLA benchmarks. Create minimal VllmConfig for MLA benchmarks.
@@ -76,11 +73,6 @@ def create_minimal_vllm_config(
max_num_seqs: Maximum number of sequences max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name) setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
"cudnn", "trtllm"). Configures the attention config to
force the specified prefill backend.
Returns: Returns:
VllmConfig for benchmarking VllmConfig for benchmarking
@@ -90,7 +82,7 @@ def create_minimal_vllm_config(
mla_dims = setup_mla_dims(model_name) mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace) # Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk) mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads # Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access # This ensures consistent ModelConfig construction without network access
@@ -128,12 +120,16 @@ def create_minimal_vllm_config(
seed=0, seed=0,
max_model_len=32768, max_model_len=32768,
quantization=None, quantization=None,
quantization_param_path=None,
enforce_eager=False, enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20, max_logprobs=20,
disable_sliding_window=False, disable_sliding_window=False,
skip_tokenizer_init=True, skip_tokenizer_init=True,
served_model_name=None, served_model_name=None,
limit_mm_per_prompt=None, limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto", config_format="auto",
) )
finally: finally:
@@ -151,13 +147,14 @@ def create_minimal_vllm_config(
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=block_size, block_size=block_size,
gpu_memory_utilization=0.9, gpu_memory_utilization=0.9,
cache_dtype=kv_cache_dtype, swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False, enable_prefix_caching=False,
) )
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs), max_num_batched_tokens=8192,
max_model_len=32768, max_model_len=32768,
is_encoder_decoder=False, is_encoder_decoder=False,
enable_chunked_prefill=True, enable_chunked_prefill=True,
@@ -169,7 +166,7 @@ def create_minimal_vllm_config(
compilation_config = CompilationConfig() compilation_config = CompilationConfig()
vllm_config = VllmConfig( return VllmConfig(
model_config=model_config, model_config=model_config,
cache_config=cache_config, cache_config=cache_config,
parallel_config=parallel_config, parallel_config=parallel_config,
@@ -177,147 +174,62 @@ def create_minimal_vllm_config(
compilation_config=compilation_config, compilation_config=compilation_config,
) )
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
if prefill_cfg["flash_attn_version"] is not None:
vllm_config.attention_config.flash_attn_version = prefill_cfg[
"flash_attn_version"
]
vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[
"disable_flashinfer_prefill"
]
vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[
"use_cudnn_prefill"
]
vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
"use_trtllm_ragged_deepseek_prefill"
]
return vllm_config
# ============================================================================ # ============================================================================
# Prefill Backend Configuration # Backend Configuration
# ============================================================================ # ============================================================================
# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths. # Backend name to class name prefix mapping
# Non-FA backends enable their specific path and disable others. _BACKEND_NAME_MAP = {
_PREFILL_BACKEND_CONFIG: dict[str, dict] = { "flashattn_mla": "FlashAttnMLA",
"fa2": { "flashmla": "FlashMLA",
"flash_attn_version": 2, "flashinfer_mla": "FlashInferMLA",
"disable_flashinfer_prefill": True, "cutlass_mla": "CutlassMLA",
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa3": {
"flash_attn_version": 3,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"fa4": {
"flash_attn_version": 4,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"flashinfer": {
"flash_attn_version": None,
"disable_flashinfer_prefill": False,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": False,
},
"cudnn": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": True,
"use_trtllm_ragged_deepseek_prefill": False,
},
"trtllm": {
"flash_attn_version": None,
"disable_flashinfer_prefill": True,
"use_cudnn_prefill": False,
"use_trtllm_ragged_deepseek_prefill": True,
},
} }
# Special properties that differ from defaults
def get_prefill_backend_config(prefill_backend: str) -> dict:
"""Get attention config overrides for a prefill backend."""
if prefill_backend not in _PREFILL_BACKEND_CONFIG:
raise ValueError(
f"Unknown prefill backend: {prefill_backend!r}. "
f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}"
)
return _PREFILL_BACKEND_CONFIG[prefill_backend]
# ============================================================================
# Decode Backend Configuration
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
_BACKEND_PROPERTIES = { _BACKEND_PROPERTIES = {
"FLASHMLA": { "flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple) "query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
}, },
"FLASHMLA_SPARSE": { "flashinfer_mla": {
"query_format": "concat", # Single concatenated tensor (vs tuple) "block_size": 64, # FlashInfer MLA only supports 32 or 64
}, },
} }
def _get_backend_config(backend: str) -> dict: def _get_backend_config(backend: str) -> dict:
""" """
Get backend configuration from AttentionBackendEnum. Get backend configuration using naming conventions.
Uses the registry to get the backend class and extract configuration All MLA backends follow the pattern:
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.). - Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
Args: - Metadata: {Name}Metadata (or MLACommonMetadata)
backend: Backend name matching AttentionBackendEnum exactly - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
(e.g., "FLASHMLA_SPARSE") - MetadataBuilder: {Name}MetadataBuilder
Returns:
Dict with backend configuration
""" """
from vllm.v1.attention.backend import MultipleOf if backend not in _BACKEND_NAME_MAP:
from vllm.v1.attention.backends.registry import AttentionBackendEnum raise ValueError(f"Unknown backend: {backend}")
try: name = _BACKEND_NAME_MAP[backend]
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
# No fixed block size; fall back to config value
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {}) props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return { return {
"backend_class": backend_class, "module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": backend_class.get_impl_cls(), "impl_class": f"{name}Impl",
"builder_class": backend_class.get_builder_cls(), "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"), "query_format": props.get("query_format", "tuple"),
"block_size": block_size, "block_size": props.get("block_size", None),
"is_sparse": is_sparse,
} }
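A condensed sketch of the registry-driven resolution that replaces the old name map here: the impl class, builder class, kernel block size, and sparsity all come off the backend class returned by AttentionBackendEnum. This assumes the enum members and class methods referenced in this hunk exist in the installed vLLM; block sizes expressed as MultipleOf fall back to the benchmark config value, as in the code above.

from vllm.v1.attention.backend import MultipleOf
from vllm.v1.attention.backends.registry import AttentionBackendEnum

backend_class = AttentionBackendEnum["FLASHMLA"].get_class()
block_sizes = backend_class.get_supported_kernel_block_sizes()
block_size = block_sizes[0] if block_sizes else None
if isinstance(block_size, MultipleOf):
    block_size = None  # no fixed kernel block size; defer to the config value
print(backend_class.get_impl_cls(), backend_class.get_builder_cls(), block_size)
print(getattr(backend_class, "is_sparse", lambda: False)())  # expected False for dense FLASHMLA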
@@ -535,27 +447,22 @@ def _create_backend_impl(
mla_dims: dict, mla_dims: dict,
vllm_config: VllmConfig, vllm_config: VllmConfig,
device: torch.device, device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
kv_cache_dtype: str = "auto",
): ):
""" """
Create backend implementation instance. Create backend implementation instance.
Args: Args:
backend_cfg: Backend configuration dict from _get_backend_config() backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance vllm_config: VllmConfig instance
device: Target device device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns: Returns:
Tuple of (impl, layer, builder_instance, indexer) Tuple of (impl, layer, builder_instance)
""" """
# Get classes from backend config (already resolved by _get_backend_config) # Import backend classes
impl_class = backend_cfg["impl_class"] backend_module = importlib.import_module(backend_cfg["module"])
builder_class = backend_cfg["builder_class"] impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale # Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
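For concreteness, the softmax scale computed here works out as follows for the DeepSeek dimensions used in these configs (qk_nope_head_dim=128, qk_rope_head_dim=64):

import math

scale = 1.0 / math.sqrt(128 + 64)  # qk_nope_head_dim + qk_rope_head_dim = 192
print(round(scale, 5))             # ~0.07217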
@@ -567,44 +474,26 @@ def _create_backend_impl(
v_head_dim=mla_dims["v_head_dim"], v_head_dim=mla_dims["v_head_dim"],
) )
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": kv_cache_dtype,
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl # Create impl
impl = impl_class(**impl_kwargs) impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes # Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
@@ -626,7 +515,9 @@ def _create_backend_impl(
# Create builder instance if needed # Create builder instance if needed
builder_instance = None builder_instance = None
if builder_class: if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer # Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer} vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
@@ -638,7 +529,7 @@ def _create_backend_impl(
device=device, device=device,
) )
return impl, layer, builder_instance, indexer return impl, layer, builder_instance
# ============================================================================ # ============================================================================
@@ -703,8 +594,6 @@ def _run_single_benchmark(
backend_cfg: dict, backend_cfg: dict,
mla_dims: dict, mla_dims: dict,
device: torch.device, device: torch.device,
indexer=None,
kv_cache_dtype: str | None = None,
) -> BenchmarkResult: ) -> BenchmarkResult:
""" """
Run a single benchmark iteration. Run a single benchmark iteration.
@@ -717,7 +606,6 @@ def _run_single_benchmark(
backend_cfg: Backend configuration dict backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration mla_dims: MLA dimension configuration
device: Target device device: Target device
indexer: Optional MockIndexer for sparse backends
Returns: Returns:
BenchmarkResult with timing statistics BenchmarkResult with timing statistics
@@ -725,9 +613,7 @@ def _run_single_benchmark(
# Parse batch spec # Parse batch spec
requests = parse_batch_spec(config.batch_spec) requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests] q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens) total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size # Determine block size
block_size = backend_cfg["block_size"] or config.block_size block_size = backend_cfg["block_size"] or config.block_size
@@ -738,123 +624,45 @@ def _run_single_benchmark(
) )
# Create KV cache # Create KV cache
if kv_cache_dtype is None: kv_cache = torch.zeros(
kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto") num_blocks,
head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"] block_size,
if kv_cache_dtype == "fp8_ds_mla": mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
# FlashMLA sparse custom format: 656 bytes per token, stored as uint8. device=device,
# Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales dtype=torch.bfloat16,
# + 2*rope_dim bf16 bytes )
# = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
kv_cache = torch.zeros(
num_blocks,
block_size,
656,
device=device,
dtype=torch.uint8,
)
elif kv_cache_dtype == "fp8":
from vllm.platforms import current_platform
kv_cache = torch.zeros( # Create input tensors for both decode and prefill modes
num_blocks, decode_inputs, prefill_inputs = _create_input_tensors(
block_size, total_q,
head_size, mla_dims,
device=device, backend_cfg["query_format"],
dtype=torch.uint8, device,
).view(current_platform.fp8_dtype()) torch.bfloat16,
)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else: else:
kv_cache = torch.zeros(
num_blocks,
block_size,
head_size,
device=device,
dtype=torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward methods to use based on metadata.
# Sparse MLA backends always use forward_mqa
has_decode = is_sparse or getattr(metadata, "decode", None) is not None
has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
if not has_decode and not has_prefill:
raise RuntimeError("Metadata has neither decode nor prefill metadata") raise RuntimeError("Metadata has neither decode nor prefill metadata")
num_decode = (
metadata.num_decode_tokens
if (has_decode and has_prefill)
else total_q
if has_decode
else 0
)
num_prefill = total_q - num_decode
# Some backends require fp8 queries when using fp8 KV cache.
is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
quantize_query = is_fp8_kvcache and getattr(
impl, "supports_quant_query_input", False
)
# quantize_query forces concat format
query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
# Create decode query tensors
if has_decode:
decode_inputs, _ = _create_input_tensors(
num_decode, mla_dims, query_fmt, device, torch.bfloat16
)
# Cast decode query to fp8 if the backend supports it
if quantize_query:
from vllm.platforms import current_platform
if isinstance(decode_inputs, tuple):
decode_inputs = torch.cat(list(decode_inputs), dim=-1)
decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
# Create prefill input tensors
if has_prefill:
_, prefill_inputs = _create_input_tensors(
num_prefill, mla_dims, query_fmt, device, torch.bfloat16
)
# Build forward function
def forward_fn():
results = []
if has_decode:
results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
if has_prefill:
results.append(
impl.forward_mha(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
)
return results[0] if len(results) == 1 else tuple(results)
# Warmup # Warmup
for _ in range(config.warmup_iters): for _ in range(config.warmup_iters):
forward_fn() forward_fn()
torch.accelerator.synchronize() torch.cuda.synchronize()
# Optionally capture a CUDA graph after warmup.
# Graph replay eliminates CPU launch overhead so timings reflect pure
# kernel time.
if config.use_cuda_graphs:
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
forward_fn()
benchmark_fn = graph.replay
else:
benchmark_fn = forward_fn
# Benchmark # Benchmark
times = [] times = []
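A quick sanity check of the 656-byte fp8_ds_mla per-token layout described in the hunk above, using the DeepSeek dimensions from these configs (kv_lora_rank=512, qk_rope_head_dim=64); the figures come straight from the comments in the diff, not from re-deriving the kernel format:

kv_lora_rank_bytes = 512   # kv_lora_rank values stored as 1-byte fp8
tile_scale_bytes = 4 * 4   # 4 float32 tile scales, 4 bytes each
rope_bytes = 2 * 64        # qk_rope_head_dim values kept in bf16 (2 bytes each)
assert kv_lora_rank_bytes + tile_scale_bytes + rope_bytes == 656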
@@ -864,10 +672,10 @@ def _run_single_benchmark(
start.record() start.record()
for _ in range(config.num_layers): for _ in range(config.num_layers):
benchmark_fn() forward_fn()
end.record() end.record()
torch.accelerator.synchronize() torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end) elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) times.append(elapsed_ms / 1000.0 / config.num_layers)
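The timing loop above follows the standard CUDA-event pattern: record a start event, run the kernel num_layers times, record an end event, synchronize, and divide the elapsed time by the iteration count (with config.use_cuda_graphs swapping forward_fn for a captured graph's replay to strip CPU launch overhead). A self-contained sketch of that pattern, assuming only PyTorch and a CUDA device:

import torch

def time_fn(fn, repeats: int = 5, inner_iters: int = 60) -> float:
    """Mean seconds per call, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    times = []
    for _ in range(repeats):
        start.record()
        for _ in range(inner_iters):
            fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end) / 1000.0 / inner_iters)
    return sum(times) / len(times)

x = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
print(time_fn(lambda: x @ x))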
@@ -885,26 +693,20 @@ def _run_single_benchmark(
def _run_mla_benchmark_batched( def _run_mla_benchmark_batched(
backend: str, backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> list[BenchmarkResult]: ) -> list[BenchmarkResult]:
""" """
Unified batched MLA benchmark runner for all backends. Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
flashinfer_mla_sparse, flashmla_sparse
This function reuses backend initialization across multiple benchmarks This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead. to avoid setup/teardown overhead.
Args: Args:
backend: Backend name (decode backend used for impl construction) backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only) - num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns: Returns:
List of BenchmarkResult objects List of BenchmarkResult objects
@@ -914,7 +716,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend) backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device) device = torch.device(configs_with_params[0][0].device)
torch.accelerator.set_device_index(device) torch.cuda.set_device(device)
# Determine block size # Determine block size
config_block_size = configs_with_params[0][0].block_size config_block_size = configs_with_params[0][0].block_size
@@ -928,94 +730,21 @@ def _run_mla_benchmark_batched(
if mla_dims is None: if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3") mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Extract kv_cache_dtype from the first config
kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
# FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
# Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
kv_cache_dtype = "fp8_ds_mla"
# Compute max total_q across all configs so the metadata builder buffer
# and scheduler config are large enough for all batch specs.
max_total_q = max(
sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
for cfg, *_ in configs_with_params
)
# Create and set vLLM config for MLA (reused across all benchmarks) # Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config( vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path model_name="deepseek-v3", # Used only for model path
block_size=block_size, block_size=block_size,
max_num_batched_tokens=max_total_q,
mla_dims=mla_dims, # Use custom dims from config or default mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
prefill_backend=prefill_backend,
kv_cache_dtype=kv_cache_dtype,
) )
results = [] results = []
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
# Clear cached prefill backend detection functions so they re-evaluate # Create backend impl, layer, and builder (reused across benchmarks)
# with the current VllmConfig. These are @functools.cache decorated and impl, layer, builder_instance = _create_backend_impl(
# would otherwise return stale results from a previous backend's config. backend_cfg, mla_dims, vllm_config, device
from vllm.model_executor.layers.attention.mla_attention import (
use_cudnn_prefill,
use_flashinfer_prefill,
use_trtllm_ragged_deepseek_prefill,
) )
use_flashinfer_prefill.cache_clear()
use_cudnn_prefill.cache_clear()
use_trtllm_ragged_deepseek_prefill.cache_clear()
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
max_num_tokens=max_total_q,
index_topk=index_topk if is_sparse else None,
kv_cache_dtype=kv_cache_dtype,
)
# Verify the actual prefill backend matches what was requested
if prefill_backend is not None:
prefill_cfg = get_prefill_backend_config(prefill_backend)
fa_version = prefill_cfg["flash_attn_version"]
if fa_version is not None:
# FA backend: verify the impl's FA version
actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
if actual_fa_version != fa_version:
raise RuntimeError(
f"Prefill backend '{prefill_backend}' requested FA "
f"version {fa_version}, but the impl is using FA "
f"version {actual_fa_version}. Check "
f"vllm/v1/attention/backends/fa_utils.py."
)
else:
# Non-FA backend: verify the builder picked the right path
expected_flags = {
"flashinfer": "_use_fi_prefill",
"cudnn": "_use_cudnn_prefill",
"trtllm": "_use_trtllm_ragged_prefill",
}
flag_name = expected_flags.get(prefill_backend)
if flag_name and not getattr(builder_instance, flag_name, False):
raise RuntimeError(
f"Prefill backend '{prefill_backend}' was requested "
f"but the metadata builder did not enable it. This "
f"usually means a dependency is missing (e.g., "
f"flashinfer not installed) or the platform doesn't "
f"support it."
)
# Run each benchmark with the shared impl # Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params: for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only) # Set threshold for this benchmark (FlashAttn/FlashMLA only)
@@ -1039,8 +768,6 @@ def _run_mla_benchmark_batched(
backend_cfg, backend_cfg,
mla_dims, mla_dims,
device, device,
indexer=indexer,
kv_cache_dtype=kv_cache_dtype,
) )
results.append(result) results.append(result)
@@ -1066,27 +793,20 @@ def run_mla_benchmark(
config, config,
reorder_batch_threshold: int | None = None, reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None, num_kv_splits: int | None = None,
index_topk: int = 2048,
prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]: ) -> BenchmarkResult | list[BenchmarkResult]:
""" """
Unified MLA benchmark runner for all backends. Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
flashinfer_mla_sparse, flashmla_sparse
Always uses batched execution internally for optimal performance. Always uses batched execution internally for optimal performance.
Args: Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
flashinfer_mla_sparse, flashmla_sparse)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only) (single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only) num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
When set, forces the specified FlashAttention version for prefill.
Returns: Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
@@ -1096,9 +816,9 @@ def run_mla_benchmark(
# Already in batched format # Already in batched format
if len(config) > 0 and isinstance(config[0], tuple): if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits # Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"): if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config] configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config] configs_with_params = [(cfg, None, param) for cfg, param in config]
else: else:
# Format: [cfg, ...] - just configs # Format: [cfg, ...] - just configs
@@ -1110,9 +830,7 @@ def run_mla_benchmark(
return_single = True return_single = True
# Use unified batched execution # Use unified batched execution
results = _run_mla_benchmark_batched( results = _run_mla_benchmark_batched(backend, configs_with_params)
backend, configs_with_params, index_topk, prefill_backend=prefill_backend
)
# Return single result or list based on input # Return single result or list based on input
return results[0] if return_single else results return results[0] if return_single else results

Some files were not shown because too many files have changed in this diff.