[Bugfix] Fix MTP accuracy for GLM-5 (#34385)

Signed-off-by: mgoin <mgoin64@gmail.com> (cherry picked from commit ec12d39d44)
[Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
2026-02-11 20:54:27 -08:00 · 2026-02-11 20:54:14 -08:00 · 2026-02-11 20:54:00 -08:00 · 2026-02-11 20:53:40 -08:00 · 2026-02-11 20:53:11 -08:00 · 2026-02-11 02:33:40 -08:00
1249 changed files with 69057 additions and 28721 deletions
--- a/tests/compile/distributed/init.py
+++ b/tests/compile/distributed/init.py
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,7 +1,8 @@
 name: vllm_ci
 job_dirs:
-  - ".buildkite/test_areas"
  - ".buildkite/image_build"
+  - ".buildkite/test_areas"
+  - ".buildkite/hardware_tests"
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -0,0 +1,29 @@
+group: Hardware
+steps:
+  - label: "AMD: :docker: build image"
+    depends_on: []
+    device: amd_cpu
+    no_plugin: true
+    commands:
+    - >
+      docker build
+      --build-arg max_jobs=16
+      --build-arg REMOTE_VLLM=1
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
+      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      -f docker/Dockerfile.rocm
+      --target test
+      --no-cache
+      --progress plain .
+    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -0,0 +1,10 @@
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Ascend NPU Test"
+    soft_fail: true
+    timeout_in_minutes: 20
+    no_plugin: true
+    device: ascend_npu
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -0,0 +1,100 @@
+group: CPU
+depends_on: []
+steps:
+- label: CPU-Kernel Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - cmake/cpu_extension.cmake
+  - CMakeLists.txt
+  - vllm/_custom_ops.py
+  - tests/kernels/attention/test_cpu_attn.py
+  - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/test_onednn.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/test_onednn.py"
+
+- label: CPU-Language Generation and Pooling Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/
+  - tests/models/language/generation/
+  - tests/models/language/pooling/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      pytest -x -v -s tests/models/language/generation -m cpu_model
+      pytest -x -v -s tests/models/language/pooling -m cpu_model"
+
+- label: CPU-Quantization Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/model_executor/layers/quantization/cpu_wna16.py
+  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - tests/quantization/test_compressed_tensors.py
+  - tests/quantization/test_cpu_wna16.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
+      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
+      
+- label: CPU-Distributed Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/shm.cpp
+  - vllm/v1/worker/cpu_worker.py
+  - vllm/v1/worker/gpu_worker.py
+  - vllm/v1/worker/cpu_model_runner.py
+  - vllm/v1/worker/gpu_model_runner.py
+  - vllm/platforms/cpu.py
+  - vllm/distributed/parallel_state.py
+  - vllm/distributed/device_communicators/cpu_communicator.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+
+- label: CPU-Multi-Modal Model Tests %N
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  # - vllm/
+  - vllm/model_executor/layers/rotary_embedding
+  - tests/models/multimodal/generation/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
+      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
+  parallelism: 2
+
+- label: "Arm CPU Test"
+  depends_on: []
+  soft_fail: true
+  device: arm_cpu
+  no_plugin: true
+  commands: 
+  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -0,0 +1,10 @@
+group: Hardware
+steps:
+  - label: "GH200 Test"
+    soft_fail: true
+    device: gh200
+    no_plugin: true
+    optional: true
+    commands: 
+    - nvidia-smi 
+    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -0,0 +1,17 @@
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Intel HPU Test"
+    soft_fail: true
+    device: intel_hpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
+
+  - label: "Intel GPU Test"
+    depends_on: []
+    soft_fail: true
+    device: intel_gpu
+    no_plugin: true
+    commands: 
+    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,56 +1,256 @@
 #!/bin/bash
-set -e
+set -euo pipefail

-if [[ $# -lt 8 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
-  exit 1
+# replace invalid characters in Docker image tags and truncate to 128 chars
+clean_docker_tag() {
+    local input="$1"
+    echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
+}
+
+print_usage_and_exit() {
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+    exit 1
+}
+
+print_instance_info() {
+    echo ""
+    echo "=== Debug: Instance Information ==="
+    # Get IMDSv2 token
+    if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
+            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
+        AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
+        INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
+        INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
+        AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
+        echo "AMI ID:        ${AMI_ID}"
+        echo "Instance Type: ${INSTANCE_TYPE}"
+        echo "Instance ID:   ${INSTANCE_ID}"
+        echo "AZ:            ${AZ}"
+    else
+        echo "Not running on EC2 or IMDS not available"
+    fi
+    # Check for warm cache AMI (marker file baked into custom AMI)
+    if [[ -f /etc/vllm-ami-info ]]; then
+        echo "Cache:         warm (custom vLLM AMI)"
+        cat /etc/vllm-ami-info
+    else
+        echo "Cache:         cold (standard AMI)"
+    fi
+    echo "==================================="
+    echo ""
+}
+
+setup_buildx_builder() {
+    echo "--- :buildkite: Setting up buildx builder"
+    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
+        # Custom AMI with standalone buildkitd - use remote driver for warm cache
+        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
+        echo "Using remote driver to connect to buildkitd (warm cache available)"
+        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
+            echo "Using existing baked-vllm-builder"
+            docker buildx use baked-vllm-builder
+        else
+            echo "Creating baked-vllm-builder with remote driver"
+            docker buildx create \
+                --name baked-vllm-builder \
+                --driver remote \
+                --use \
+                "unix://${BUILDKIT_SOCKET}"
+        fi
+        docker buildx inspect --bootstrap
+    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+        # Existing builder available
+        echo "Using existing builder: ${BUILDER_NAME}"
+        docker buildx use "${BUILDER_NAME}"
+        docker buildx inspect --bootstrap
+    else
+        # No local buildkitd, no existing builder - create new docker-container builder
+        echo "No local buildkitd found, using docker-container driver"
+        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+        docker buildx inspect --bootstrap
+    fi
+
+    # builder info
+    echo "Active builder:"
+    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
+}
+
+check_and_skip_if_image_exists() {
+    if [[ -n "${IMAGE_TAG:-}" ]]; then
+        echo "--- :mag: Checking if image exists"
+        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+            echo "Image already exists: ${IMAGE_TAG}"
+            echo "Skipping build"
+            exit 0
+        fi
+        echo "Image not found, proceeding with build"
+    fi
+}
+
+ecr_login() {
+    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+}
+
+prepare_cache_tags() {
+    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
+    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
+    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
+
+    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+            cache="${MAIN_CACHE_ECR}:latest"
+        else
+            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
+            cache="${TEST_CACHE_ECR}:${clean_branch}"
+        fi
+        CACHE_TO="$cache"
+        CACHE_FROM="$cache"
+        CACHE_FROM_BASE_BRANCH="$cache"
+    else
+        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
+            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
+        else
+            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
+            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
+        fi
+    fi
+
+    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
+    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
+}
+
+resolve_parent_commit() {
+    if [[ -z "${PARENT_COMMIT:-}" ]]; then
+        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
+        if [[ -n "${PARENT_COMMIT}" ]]; then
+            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
+            export PARENT_COMMIT
+        else
+            echo "Could not determine parent commit (may be first commit in repo)"
+        fi
+    else
+        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
+    fi
+}
+
+print_bake_config() {
+    echo "--- :page_facing_up: Resolved bake configuration"
+    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
+    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
+    echo "--- :arrow_down: Uploading bake config to Buildkite"
+    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+}
+
+#################################
+#         Main Script           #
+#################################
+print_instance_info
+
+if [[ $# -lt 7 ]]; then
+    print_usage_and_exit
 fi

+# input args
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
 VLLM_USE_PRECOMPILED=$5
 VLLM_MERGE_BASE_COMMIT=$6
-CACHE_FROM=$7
-CACHE_TO=$8
+IMAGE_TAG=$7
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional

-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+# build config
+TARGET="test-ci"
+VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}"
+BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
+CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
+CI_HCL_PATH="/tmp/ci.hcl"
+BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"

-# docker buildx 
-docker buildx create --name vllm-builder --driver docker-container --use
-docker buildx inspect --bootstrap
-docker buildx ls
+prepare_cache_tags
+ecr_login

-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
+# Environment info (for docs and human readers)
+#   CI_HCL_URL          - URL of the ci-infra ci.hcl to download (default: ci.hcl on the ci-infra main branch)
+#   VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl)
+#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
+#
+# Build configuration (exported as environment variables for bake):
+export BUILDKITE_COMMIT
+export PARENT_COMMIT
+export IMAGE_TAG
+export IMAGE_TAG_LATEST
+export CACHE_FROM
+export CACHE_FROM_BASE_BRANCH
+export CACHE_FROM_MAIN
+export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
+
+# print args
+echo "--- :mag: Arguments"
+echo "REGISTRY: ${REGISTRY}"
+echo "REPO: ${REPO}"
+echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
+echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
+echo "IMAGE_TAG: ${IMAGE_TAG}"
+echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
+
+# print build configuration
+echo "--- :mag: Build configuration"
+echo "TARGET: ${TARGET}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
+echo "BUILDER_NAME: ${BUILDER_NAME}"
+echo "CI_HCL_URL: ${CI_HCL_URL}"
+echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
+
+echo "--- :mag: Cache tags"
+echo "CACHE_TO: ${CACHE_TO}"
+echo "CACHE_FROM: ${CACHE_FROM}"
+echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
+echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
+
+check_and_skip_if_image_exists
+
+echo "--- :docker: Setting up Docker buildx bake"
+echo "Target: ${TARGET}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
+echo "CI HCL path: ${CI_HCL_PATH}"
+
+if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
+    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
+    echo "Make sure you're running from the vLLM repository root"
+    exit 1
 fi

-if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
-  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
-else
-  merge_base_commit_build_args=""
+echo "--- :arrow_down: Downloading ci.hcl"
+curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
+echo "Downloaded to ${CI_HCL_PATH}"
+
+if [[ ! -f "${CI_HCL_PATH}" ]]; then
+    echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
+    exit 1
 fi

-# build
-docker buildx build --file docker/Dockerfile \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
-  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
-  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
-  ${merge_base_commit_build_args} \
-  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
-  --cache-to type=registry,ref=${CACHE_TO},mode=max \
-  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
-  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
-  --push \
-  --target test \
-  --progress plain .
+setup_buildx_builder
+
+resolve_parent_commit
+export PARENT_COMMIT
+
+print_bake_config
+
+echo "--- :docker: Building ${TARGET}"
+docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
+
+echo "--- :white_check_mark: Build complete"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,8 +3,10 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
+    timeout_in_minutes: 600
    commands:
-    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
@@ -40,7 +42,7 @@ steps:
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
-  
+
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -0,0 +1,15 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.695
+  - name: "exact_match,flexible-extract"
+    value: 0.447
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -0,0 +1,19 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.7142
+  - name: "exact_match,flexible-extract"
+    value: 0.4579
+env_vars:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+kv_cache_dtype: fp8
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -1 +1,2 @@
 Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -393,7 +393,7 @@ if __name__ == "__main__":
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
-            + "performance-benchmarks-descriptions.md"
+            "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -25,9 +25,9 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  
+
  declare -g arch_suffix=''
-  
+
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
@@ -181,19 +181,20 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

-run_latency_tests() {
-  # run latency tests using `vllm bench latency` command
-  # $1: a json file specifying latency test cases
+run_benchmark_tests() {
+  # run benchmark tests using `vllm bench <test_type>` command
+  # $1: test type (latency, startup, or throughput)
+  # $2: a json file specifying test cases

-  local latency_test_file
-  latency_test_file=$1
+  local test_type=$1
+  local test_file=$2

-  # Iterate over latency tests
-  jq -c '.[]' "$latency_test_file" | while read -r params; do
+  # Iterate over tests
+  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^latency_ ]]; then
-      echo "In latency-test.json, test_name must start with \"latency_\"."
+    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
+      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi

@@ -204,15 +205,15 @@ run_latency_tests() {
    fi

    # get arguments
-    latency_params=$(echo "$params" | jq -r '.parameters')
-    latency_args=$(json2args "$latency_params")
-    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    latency_envs=$(json2envs "$latency_environment_variables")
+    bench_params=$(echo "$params" | jq -r '.parameters')
+    bench_args=$(json2args "$bench_params")
+    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+    bench_envs=$(json2envs "$bench_environment_variables")

    # check if there is enough GPU to run the test
-    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
+      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -225,97 +226,42 @@ run_latency_tests() {
      fi
    fi

-    latency_command=" $latency_envs vllm bench latency \
+    bench_command=" $bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $latency_args"
+      $bench_args"

    echo "Running test case $test_name"
-    echo "Latency command: $latency_command"
+    echo "${test_type^} command: $bench_command"

-    # recoding benchmarking command ang GPU command
+    # recording benchmarking command and GPU command
    jq_output=$(jq -n \
-      --arg latency "$latency_command" \
+      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
+      --arg test_type "$test_type" \
      '{
-        latency_command: $latency,
+        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
-    eval "$latency_command"
+    eval "$bench_command"

    kill_gpu_processes

  done
 }

+run_latency_tests() {
+  run_benchmark_tests "latency" "$1"
+}
+
+run_startup_tests() {
+  run_benchmark_tests "startup" "$1"
+}
+
 run_throughput_tests() {
-  # run throughput tests using `vllm bench throughput`
-  # $1: a json file specifying throughput test cases
-
-  local throughput_test_file
-  throughput_test_file=$1
-
-  # Iterate over throughput tests
-  jq -c '.[]' "$throughput_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^throughput_ ]]; then
-      echo "In throughput-test.json, test_name must start with \"throughput_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    throughput_params=$(echo "$params" | jq -r '.parameters')
-    throughput_args=$(json2args "$throughput_params")
-    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-    throughput_envs=$(json2envs "$throughput_environment_variables")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [[ "$ON_CPU" == "1" ]]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
-      world_size=$(($tp*$pp))
-      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
-        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
-        continue
-      fi
-    else
-      if [[ $gpu_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
-        continue
-      fi
-    fi
-
-    throughput_command=" $throughput_envs vllm bench throughput \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $throughput_args"
-
-    echo "Running test case $test_name"
-    echo "Throughput command: $throughput_command"
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg command "$throughput_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        throughput_command: $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$throughput_command"
-
-    kill_gpu_processes
-
-  done
+  run_benchmark_tests "throughput" "$1"
 }

 run_serving_tests() {
@@ -447,6 +393,11 @@ run_serving_tests() {
      fi
    fi

+    # save the compilation mode and optimization level on the serving results
+    # whenever they are set
+    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
+    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
+
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
@@ -460,15 +411,15 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
-        # pass the tensor parallel size to the client so that it can be displayed
-        # on the benchmark dashboard
+        # pass the tensor parallel size, the compilation mode, and the optimization
+        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
-          --metadata "tensor_parallel_size=$tp" \
+          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
@@ -534,6 +485,7 @@ main() {
  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
+  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -176,23 +176,6 @@ steps:
        env:
          DOCKER_BUILDKIT: "1"

-      - block: "Build release image for x86_64 ROCm"
-        key: block-rocm-release-image-build
-        depends_on: ~
-
-      - label: "Build release image - x86_64 - ROCm"
-        depends_on: block-rocm-release-image-build
-        id: build-release-image-rocm
-        agents:
-          queue: cpu_queue_postmerge
-        commands:
-          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          # Build base image first
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
-          # Build vLLM ROCm image using the base
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
-          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
-
  - group: "Publish release images"
    key: "publish-release-images"
    steps:
@@ -274,14 +257,14 @@ steps:
          - input-release-version
          - build-wheels

-      - label: "Upload release wheels to PyPI and GitHub"
+      - label: "Upload release wheels to PyPI"
        depends_on:
          - block-upload-release-wheels
        id: upload-release-wheels
        agents:
          queue: small_cpu_queue_postmerge
        commands:
-          - "bash .buildkite/scripts/upload-release-wheels.sh"
+          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"

  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)
@@ -476,7 +459,7 @@ steps:
      S3_BUCKET: "vllm-wheels"

  # ROCm Job 2: Build vLLM ROCm Wheel
-  - label: ":python: Build vLLM ROCm Wheel"
+  - label: ":python: Build vLLM ROCm Wheel - x86_64"
    id: build-rocm-vllm-wheel
    depends_on:
      - step: build-rocm-base-wheels
@@ -638,9 +621,93 @@ steps:
    depends_on:
      - step: upload-rocm-wheels
        allow_failure: true
+      - step: input-release-version
+        allow_failure: true
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-rocm-release.sh"
    env:
      S3_BUCKET: "vllm-wheels"
+
+  # ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
+  # This is the job to create https://wheels.vllm.ai/rocm/ index allowing
+  # users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
+  - block: "Generate Root Index for ROCm Wheels for Release"
+    key: block-generate-root-index-rocm-wheels
+    depends_on: upload-rocm-wheels
+
+  - label: ":package: Generate Root Index for ROCm Wheels for Release"
+    depends_on: block-generate-root-index-rocm-wheels
+    id: generate-root-index-rocm-wheels
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
+    env:
+      S3_BUCKET: "vllm-wheels"
+      VARIANT: "rocm700"
+
+  # ROCm Job 6: Build ROCm Release Docker Image
+  - label: ":docker: Build release image - x86_64 - ROCm"
+    id: build-rocm-release-image
+    depends_on:
+      - step: build-rocm-base-wheels
+        allow_failure: false
+    agents:
+      queue: cpu_queue_postmerge
+    timeout_in_minutes: 60
+    commands:
+      - |
+        set -euo pipefail
+
+        # Login to ECR
+        aws ecr-public get-login-password --region us-east-1 | \
+          docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
+
+        # Download Docker image from S3 (set by build-rocm-base-wheels)
+        DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
+        if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
+          echo "ERROR: rocm-docker-image-s3-path metadata not found"
+          exit 1
+        fi
+
+        echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
+        mkdir -p artifacts/rocm-docker-image
+        aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
+
+        # Load base Docker image
+        echo "Loading base Docker image..."
+        LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
+        BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
+        echo "Loaded base image: $${BASE_IMAGE_TAG}"
+
+        # Tag and push the base image to ECR
+        docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
+        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
+        echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
+
+        # Get GPU architectures from meta-data
+        PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
+        PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
+
+        # Build vLLM ROCm release image using cached base
+        DOCKER_BUILDKIT=1 docker build \
+          --build-arg max_jobs=16 \
+          --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
+          --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
+          --build-arg USE_SCCACHE=1 \
+          --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
+          --build-arg SCCACHE_REGION_NAME=us-west-2 \
+          --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
+          --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
+          --target vllm-openai \
+          --progress plain \
+          -f docker/Dockerfile.rocm .
+
+        # Push to ECR
+        docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
+        echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
+    env:
+      DOCKER_BUILDKIT: "1"
+      S3_BUCKET: "vllm-wheels"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -11,28 +11,36 @@ fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+(Optional) For CUDA 13.0:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
+
+(Optional) For CPU:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`

-To download the wheel (by version):
-\`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`

 To download and upload the image:

 \`\`\`
+# Download images:
+
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
+
+# Tag and push images:
+
+## CUDA

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -40,22 +48,70 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+docker push vllm/vllm-openai:latest-x86_64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
-docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
-docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
-docker push vllm/vllm-openai:latest-rocm
-docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker push vllm/vllm-openai:latest-aarch64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+
+## ROCm
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+## CPU
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai-cpu:latest-x86_64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker push vllm/vllm-openai-cpu:latest-arm64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+
+# Create multi-arch manifest:

 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+
+docker manifest rm vllm/vllm-openai:latest-cu130 || true
+docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker manifest push vllm/vllm-openai:latest-cu130
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
+
+docker manifest rm vllm/vllm-openai-cpu:latest || true
+docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
+docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker manifest push vllm/vllm-openai-cpu:latest
+docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
 \`\`\`
-EOF 
+EOF
--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -3,25 +3,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 #
 # Generate Buildkite annotation for ROCm wheel release
-
 set -ex

 # Get build configuration from meta-data
 # Extract ROCm version dynamically from Dockerfile.rocm_base
-# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
+# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
 ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
 PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
 PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

+# TODO: Enable the nightly build for ROCm
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
+fi
+
 # S3 URLs
 S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
 S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
-S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
-ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
+S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"

+# Format ROCm version for path: drop the dot and append the patch digit (e.g., "7.1" -> "rocm710")
+ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')0"
+ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
 buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
-## :rocm: ROCm Wheel Release
-
+## ROCm Wheel and Docker Image Releases
 ### Build Configuration
 | Setting | Value |
 |---------|-------|
@@ -34,41 +41,72 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
 ### :package: Installation

 **Install from this build (by commit):**
-\`\`\`bash
-uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/

-# Example:
-uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
+\`\`\`bash
+pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
+
+# Example for ROCm ${ROCM_VERSION}:
+pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
 \`\`\`

 **Install from nightly (if published):**
+
 \`\`\`bash
-uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
+pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
 \`\`\`

 ### :floppy_disk: Download Wheels Directly

 \`\`\`bash
 # List all ROCm wheels
-aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
-
+aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
 # Download specific wheels
-aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
-aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
-aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
-aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
-aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
 \`\`\`

 ### :gear: Included Packages
 - **vllm**: vLLM with ROCm support
 - **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
+- **triton**: Triton
+- **triton-kernels**: Triton kernels
 - **torchvision**: TorchVision for ROCm PyTorch
+- **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
+- **aiter**: Aiter for ROCm
+- **flash-attn**: Flash Attention for ROCm

 ### :warning: Notes
 - These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
 - Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
 - Platform: Linux x86_64 only
+
+### :package: Docker Image Release
+
+To download and upload the image:
+
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+docker push vllm/vllm-openai-rocm:latest-base
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+\`\`\`
+
 EOF
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:

 def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
    """
-    Generate project list HTML content linking to each project & variant sub-directory.
+    Generate project list HTML content linking to each project & variant subdirectory.
    """
    href_tags = []
    for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
        comment (str | None): Optional comment to include in the generated HTML files.

    First, parse all wheel files to extract metadata.
-    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+    We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
    The index for the default variant (if any) is generated in the root index directory.

    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
    is purely a copy of the corresponding variant index, with only the links adjusted.
    Otherwise, all wheels without variant suffixes are treated as the default variant.

-    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
+    If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
    as the default variant index, but the links are adjusted accordingly.

    Index directory structure:
        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
-            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
+            index.html  # project list, linking to "vllm/" and other packages, and all variant subdirectories
            vllm/
                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
                metadata.json # machine-readable metadata for all wheels in this package
-            cpu/ # cpu variant sub-directory
+            cpu/ # cpu variant subdirectory
                index.html
                vllm/
                    index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
                vllm/
                    index.html
                    metadata.json
-            cu130/ # cu130 variant sub-directory
+            cu130/ # cu130 variant subdirectory
                index.html
                vllm/
                    index.html
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -44,6 +44,17 @@ cleanup_docker() {
  fi
 }

+cleanup_network() {  # Stop any node<N> containers that still exist, then remove the docker-net network.
+  for node in $(seq 0 $((NUM_NODES-1))); do  # NOTE(review): NUM_NODES is not assigned in the visible part of this script; confirm the caller sets it
+    if docker ps -a -q -f name="node${node}" | grep -q .; then  # non-empty ID list means the container exists
+      docker stop "node${node}"
+    fi
+  done
+  if docker network ls | grep docker-net; then
+    docker network rm docker-net
+  fi
+}
+
 # Call the cleanup docker function
 cleanup_docker

@@ -76,7 +87,7 @@ mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
-echo "Commands:$commands"
+echo "Raw commands: $commands"

 commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

@@ -158,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+commands=$(echo "$commands" | sed 's/ \\ / /g')
+echo "Final commands: $commands"
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -165,7 +179,6 @@ fi
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


-PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

 # Test that we're launching on the machine that has
@@ -176,53 +189,33 @@ if [[ -z "$render_gid" ]]; then
  exit 1
 fi

-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
-if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used
-  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    # assign shard-id for each shard
-    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
-    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
-    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
-        --shm-size=16gb \
-        --group-add "$render_gid" \
-        --rm \
-        -e HIP_VISIBLE_DEVICES="${GPU}" \
-        -e HF_TOKEN \
-        -e AWS_ACCESS_KEY_ID \
-        -e AWS_SECRET_ACCESS_KEY \
-        -v "${HF_CACHE}:${HF_MOUNT}" \
-        -e "HF_HOME=${HF_MOUNT}" \
-        -e "PYTHONPATH=${MYPYTHONPATH}" \
-        --name "${container_name}_${GPU}" \
-        "${image_name}" \
-        /bin/bash -c "${commands_gpu}" \
-        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
-    PIDS+=($!)
-  done
-  #wait for all processes to finish and collect exit codes
-  for pid in "${PIDS[@]}"; do
-    wait "${pid}"
-    STATUS+=($?)
-  done
-  at_least_one_shard_with_tests=0
-  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
-      echo "One of the processes failed with $st"
-      exit "${st}"
-    elif [[ ${st} -eq 5 ]]; then
-      echo "Shard exited with status 5 (no tests collected) - treating as success"
-    else # This means st is 0
-      at_least_one_shard_with_tests=1
-    fi
-  done
-  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
-    echo "All shards reported no tests collected. Failing the build."
-    exit 1
+if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+
+  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
+
+  # Expected shape: <prefix>[cmd,...] && [cmd,...]; each bracketed list is split on commas below.
+  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
+      prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+      echo "PREFIX: ${prefix}"
+      export composite_command="(command rocm-smi || true)"
+      IFS=',' read -ra node0 <<< "${BASH_REMATCH[2]}"
+      IFS=',' read -ra node1 <<< "${BASH_REMATCH[3]}"
+      for i in "${!node0[@]}";do
+        command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
+        command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
+        export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+        echo "COMMANDS: ${commands}"
+        composite_command=$(echo "${composite_command} && ${commands}")
+      done
+      # Remember the test result so that the cleanup step cannot mask a failure.
+      multi_node_status=0
+      /bin/bash -c "${composite_command}" || multi_node_status=$?
+      cleanup_network
+      [[ ${multi_node_status} -eq 0 ]] || exit "${multi_node_status}"
+  else
+      echo "Failed to parse node commands! Exiting."
+      cleanup_network
+      exit 111
  fi
 else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -euox pipefail
+# Serve the model under two parallelism layouts in turn and run a short benchmark against each.
+echo "--- PP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+    --backend vllm \
+    --dataset-name random \
+    --model meta-llama/Llama-3.2-3B-Instruct \
+    --num-prompts 20 \
+    --endpoint /v1/completions
+kill -s SIGTERM "$server_pid" || true; wait "$server_pid" || true  # wait so the next readiness probe cannot hit this server
+
+echo "--- DP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+    --backend vllm \
+    --dataset-name random \
+    --model meta-llama/Llama-3.2-3B-Instruct \
+    --num-prompts 20 \
+    --endpoint /v1/completions
+kill -s SIGTERM "$server_pid" || true; wait "$server_pid" || true  # reap the server before the script exits
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,119 +2,19 @@

 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -ex
+set -euox pipefail

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
+IMAGE_NAME="cpu-test-$NUMA_NODE"
+TIMEOUT_VAL=$1
+TEST_COMMAND=$2

-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
-# Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+# building the docker image
+echo "--- :docker: Building Docker image"
+docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
-
-function cpu_tests() {
-  set -e
-  export NUMA_NODE=$2
-
-  # list packages
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    pip list"
-
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pip list"
-
-  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-    pytest -x -v -s tests/kernels/test_onednn.py"
-
-  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
-
-    pytest -x -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_pixtral.py \
-                -m cpu_model"
-
-  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
-
-  # Run AWQ/GPTQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_cpu_wna16.py"
-
-  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/lora/test_qwenvl.py"
-
-  # online serving: tp+pp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-
-  # online serving: tp+dp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -5,7 +5,9 @@
 set -exuo pipefail

 # Try building the docker image
-cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
+image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
+container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
+cat <<EOF | docker build -t ${image_name} -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
+RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
@@ -36,15 +39,20 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
+remove_docker_containers() { docker rm -f ${container_name} || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
+docker run --rm --runtime=habana --name=${container_name} --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
-  hpu-plugin-v1-test-env \
-  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
+  -e VLLM_SKIP_WARMUP=true \
+  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+  -e PT_HPU_LAZY_MODE=1 \
+  "${image_name}" \
+  /bin/bash -c '
+  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+'

 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,15 +38,18 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
    pytest -v -s v1/test_serial_utils.py
 '
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,7 +43,6 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
+    --all2all-backend $BACK \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
--- a/.buildkite/scripts/upload-release-wheels-pypi.sh
+++ b/.buildkite/scripts/upload-release-wheels-pypi.sh
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

 RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-echo "Release version from Buildkite: $RELEASE_VERSION"
 GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
-if [ -z "$GIT_VERSION" ]; then
+
+echo "Release version from Buildkite: $RELEASE_VERSION"
+
+if [[ -z "$GIT_VERSION" ]]; then
    echo "[FATAL] Not on a git tag, cannot create release."
    exit 1
 else
    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
 fi
 # sanity check for version mismatch
-if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
-  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
+  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
    echo "[WARNING] Force release and ignore version mismatch"
  else
    echo "[FATAL] Release version from Buildkite does not match Git version."
@@ -27,7 +29,7 @@ fi
 PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'

 # check pypi token
-if [ -z "$PYPI_TOKEN" ]; then
+if [[ -z "$PYPI_TOKEN" ]]; then
  echo "[FATAL] PYPI_TOKEN is not set."
  exit 1
 else
@@ -35,41 +37,8 @@ else
  export TWINE_PASSWORD="$PYPI_TOKEN"
 fi

-# check github token
-if [ -z "$GITHUB_TOKEN" ]; then
-  echo "[FATAL] GITHUB_TOKEN is not set."
-  exit 1
-else
-  export GH_TOKEN="$GITHUB_TOKEN"
-fi
-
 set -x # avoid printing secrets above

-# download gh CLI from github
-# Get latest gh CLI version from GitHub API
-GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
-if [ -z "$GH_VERSION" ]; then
-  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
-  exit 1
-fi
-echo "Downloading gh CLI version: $GH_VERSION"
-GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
-GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
-GH_INSTALL_DIR="/tmp/gh-install"
-mkdir -p "$GH_INSTALL_DIR"
-pushd "$GH_INSTALL_DIR"
-curl -L -o "$GH_TARBALL" "$GH_URL"
-tar -xzf "$GH_TARBALL"
-GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
-if [ -z "$GH_BIN" ]; then
-  echo "[FATAL] Failed to find gh CLI executable"
-  exit 1
-fi
-echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
-echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
-command "$GH_BIN" release list --limit 5
-popd
-
 # install twine from pypi
 python3 -m venv /tmp/vllm-release-env
 source /tmp/vllm-release-env/bin/activate
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
 git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
 ls -la $DIST_DIR

-
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
-if [ -z "$PYPI_WHEEL_FILES" ]; then
+if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  echo "No default variant wheels found, quitting..."
  exit 1
 fi
-python3 -m twine check $PYPI_WHEEL_FILES
-python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
-echo "Wheels uploaded to PyPI"

-# create release on GitHub with the release version and all wheels
-command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -70,6 +70,7 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -82,6 +83,7 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -231,6 +233,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -266,10 +269,16 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+  # NEW rlhf examples
+  - pushd ../examples/offline_inference/new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
+  - popd

 - label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
@@ -505,7 +514,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -525,6 +534,7 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
+    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Samplers Test # 56min
  timeout_in_minutes: 75
@@ -604,9 +614,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -640,8 +652,9 @@ steps:
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -741,7 +754,7 @@ steps:
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@@ -752,7 +765,7 @@ steps:
 - label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
@@ -851,10 +864,11 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
@@ -1131,7 +1145,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1177,44 +1191,26 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/passes/test_fusion_attn.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+
+    # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # # Wrap with quotes to escape yaml
+    # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1277,7 +1273,7 @@ steps:

 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdmultinode]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
@@ -1291,15 +1287,15 @@ steps:
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up); the output filters "| grep 'Same node test passed'" and "| grep 'Node count test passed'" were removed from the torchrun commands below
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py 
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py 
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py 
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py 
    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 68min
@@ -1508,6 +1504,9 @@ steps:
  source_file_dependencies:
  - vllm/
  commands:
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
@@ -1558,12 +1557,15 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
    - pytest -v -s tests/v1/distributed/test_dbo.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -63,6 +63,7 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -75,6 +76,7 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -204,6 +206,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -238,10 +241,16 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+  # NEW rlhf examples
+  - pushd ../examples/offline_inference/new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
+  - popd

 - label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
@@ -362,7 +371,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -444,7 +453,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -510,6 +519,7 @@ steps:
  # However, find does not normally propagate error codes, so we combine it with xargs
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+  - pytest -s -v compile/passes --ignore compile/passes/distributed

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -537,9 +547,11 @@ steps:
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -568,8 +580,9 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -792,10 +805,11 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
@@ -1017,7 +1031,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1067,84 +1081,23 @@ steps:
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    #  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    #  # Wrap with quotes to escape yaml
+    #  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
- label: Hopper Fusion E2E Tests (H100) # 10min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)  # 70min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1202,6 +1155,8 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py
+  - pytest -v -s distributed/test_packed_tensor.py
+  - pytest -v -s distributed/test_weight_transfer.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
@@ -1316,7 +1271,7 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+  - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins

 - label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
@@ -1419,6 +1374,20 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

+- label: Acceptance Length Test (Large Models) # optional
+  timeout_in_minutes: 120
+  gpu: h100
+  optional: true
+  num_gpus: 1
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/model_executor/models/mlp_speculator.py
+  - tests/v1/spec_decode/test_acceptance_length.py
+  commands:
+    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
+    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
+
 - label: LM Eval Large Models # optional
  gpu: a100
  optional: true
@@ -1453,8 +1422,8 @@ steps:
  commands:
    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
    # Run sequence parallel tests
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+    - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py

 - label: Distributed Tests (H100) # optional
  gpu: h100
@@ -1462,7 +1431,7 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,7 +4,7 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention
@@ -15,7 +15,7 @@ steps:

 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  gpu: b200
+  device: b200
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,56 +2,202 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Fusion and Compile Tests (B200)
-  timeout_in_minutes: 40
+- label: Sequence Parallel Correctness Tests (2 GPUs)
+  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  num_devices: 2
  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/model_executor/layers/
+  - vllm/compilation/
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
+  - tests/compile/correctness_e2e/test_sequence_parallel.py
  commands:
-    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

- label: Fusion E2E (2 GPUs)(B200)
-  timeout_in_minutes: 40
+- label: Sequence Parallel Correctness Tests (2xH100)
+  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: h100
  optional: true
-  num_gpus: 2
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+
+- label: AsyncTP Correctness Tests (2xH100)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
+- label: Distributed Compile Unit Tests (2xH100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/model_executor/layers
+  - tests/compile/passes/distributed/
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -s -v tests/compile/passes/distributed
+
+- label: Fusion and Compile Unit Tests (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
+  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
+  - tests/compile/passes/test_fusion_attn.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
+    - nvidia-smi
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_devices=2 is not set
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    # TODO(luka) move to H100 once pass tests run on H100
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E Quick (H100)
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

+- label: Fusion E2E Config Sweep (H100)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
+
+- label: Fusion E2E Config Sweep (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    # -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    # Run just llama3 (fp8 & fp4) for all config combinations
+    # -k "llama-3" (the three selections are OR-ed into one -k below: pytest only honors the last -k flag)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"
+
+- label: Fusion E2E TP2 Quick (H100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+
+- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
+
+- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+
+- label: Fusion E2E TP2 (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    # for ar-rms-quant-fp4, also sweep llama3 (OR-ed into one -k: pytest only honors the last -k flag)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -9,6 +9,7 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
+    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Cudagraph
  timeout_in_minutes: 20
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -16,9 +16,9 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 90
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -47,14 +47,13 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
@@ -63,6 +62,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -97,14 +97,19 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  # NEW rlhf examples
+  - cd new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  gpu: h100
-  num_gpus: 8
+  device: h100
+  num_devices: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +125,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: Distributed Tests (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -133,26 +138,22 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H200)
-  gpu: h200
+- label: Distributed Tests (2 GPUs)(H100)
+  timeout_in_minutes: 15
+  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

 - label: Distributed Tests (2 GPUs)(B200)
-  gpu: b200
+  device: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -161,8 +162,9 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  num_nodes: 2
+  no_plugin: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -171,12 +173,12 @@ steps:
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"

 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
@@ -184,10 +186,21 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

+- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
 - label: Pipeline + Context Parallelism (4 GPUs))
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -196,4 +209,4 @@ steps:
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,27 +4,27 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,10 +33,11 @@ steps:
  timeout_in_minutes: 30
  optional: true
  soft_fail: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -23,4 +23,8 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle the (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    # Run the rest of v1/engine tests
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,7 +14,7 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -15,8 +15,9 @@ steps:
  timeout_in_minutes: 35
  source_file_dependencies:
  - csrc/attention/
-  - vllm/attention
  - vllm/v1/attention
+    # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
+  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -57,8 +58,8 @@ steps:

 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  gpu: h100
-  num_gpus: 1
+  device: h100
+  num_devices: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -77,7 +78,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -85,7 +86,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -114,4 +115,55 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py
+
+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  device: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
+ 
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+  
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -12,9 +12,9 @@ steps:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: LM Eval Large Models (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -24,9 +24,9 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,10 +37,39 @@ steps:

 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
+
+- label: LM Eval Large Models (H200)
+  timeout_in_minutes: 60
+  device: h200
+  optional: true
+  num_devices: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY)
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+  
+- label: MoE Refactor Integration Test (B200 - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:

 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -16,7 +16,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -27,11 +27,12 @@ steps:
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

 - label: V1 Others (CPU)
-  depends_on: ~
+  depends_on:
+    - image-build-cpu
  source_file_dependencies:
    - vllm/
    - tests/v1
-  no_gpu: true
+  device: cpu
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -71,7 +72,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -82,7 +83,7 @@ steps:

 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -114,12 +115,14 @@ steps:
  - pytest -v -s utils_

 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
-  depends_on: ~
+  depends_on: 
+  - image-build-cpu
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -127,11 +130,12 @@ steps:
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  no_gpu: true
+  device: cpu
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -142,7 +146,7 @@ steps:
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -155,7 +159,7 @@ steps:

 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
@@ -164,4 +168,18 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
-    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+  
+- label: Acceptance Length Test (Large Models) # optional
+  timeout_in_minutes: 25
+  device: h100
+  optional: true
+  num_devices: 1
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/model_executor/models/mlp_speculator.py
+  - tests/v1/spec_decode/test_acceptance_length.py
+  commands:
+    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
+    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -33,18 +33,21 @@ steps:
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
+  depends_on: 
+  - image-build-cpu
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  no_gpu: true
+  device: cpu
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -14,11 +14,13 @@ steps:
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Processor Test (CPU)
+  depends_on: 
+  - image-build-cpu
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  no_gpu: true
+  device: cpu
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,7 +3,7 @@ depends_on:
  - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
-  timeout_in_minutes: 30
+  timeout_in_minutes: 10
  source_file_dependencies:
    - vllm/
    - tests/compile
@@ -17,8 +17,16 @@ steps:
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

+- label: PyTorch Compilation Passes Unit Tests
+  timeout_in_minutes: 20
+  source_file_dependencies:
+    - vllm/
+    - tests/compile/passes
+  commands:
+  - pytest -s -v compile/passes --ignore compile/passes/distributed
+
 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -30,16 +38,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 40
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -16,14 +16,14 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  optional: true
  source_file_dependencies:
  - vllm/
@@ -15,8 +15,8 @@ steps:

 - label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  gpu: a100
+  num_devices: 2
+  device: a100
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,8 +2,8 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention @LucasWilkinson
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
+/vllm/model_executor/layers/attention @LucasWilkinson
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
@@ -16,7 +16,7 @@
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/distributed/kv_transfer @NickLucche @ApostaC
+/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -30,12 +30,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /vllm/v1/sample @22quinn @houseroad @njhill
 /vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
-/vllm/v1/offloading @ApostaC
+/vllm/v1/kv_offload @ApostaC @orozery
+/vllm/v1/worker/gpu/kv_connector.py @orozery
+/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery

 # Model runner V2
 /vllm/v1/worker/gpu @WoosukKwon
@@ -54,13 +56,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
-/tests/v1/kv_connector @ApostaC
-/tests/v1/offloading @ApostaC
+/tests/v1/kv_connector @ApostaC @orozery
+/tests/v1/kv_offload @ApostaC @orozery
 /tests/v1/determinism @yewentao256 

 # Transformers modeling backend
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -121,24 +121,9 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
-  - id: enforce-import-regex-instead-of-re
-    name: Enforce import regex as re
-    entry: python tools/pre_commit/enforce_regex_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
-  # forbid directly import triton
-  - id: forbid-direct-triton-import
-    name: "Forbid direct 'import triton'"
-    entry: python tools/pre_commit/check_triton_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
-  - id: check-pickle-imports
-    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+  - id: check-forbidden-imports
+    name: Check for forbidden imports
+    entry: python tools/pre_commit/check_forbidden_imports.py
    language: python
    types: [python]
    additional_dependencies: [regex]
@@ -154,6 +139,10 @@ repos:
    files: ^docker/(Dockerfile|versions\.json)$
    pass_filenames: false
    additional_dependencies: [dockerfile-parse]
+  - id: attention-backend-docs
+    name: Check attention backend documentation is up to date
+    entry: python tools/pre_commit/generate_attention_backend_docs.py --check
+    language: python
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -433,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
    endif()

-    if (MARLIN_SM75_ARCHS) 
+    if (MARLIN_SM75_ARCHS)
      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -445,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
    endif()

-    if (MARLIN_FP8_ARCHS) 
+    if (MARLIN_FP8_ARCHS)
      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -458,7 +458,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()

    set(MARLIN_SRCS
-       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/marlin.cu"
       "csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
       "csrc/quantization/marlin/gptq_marlin_repack.cu"
@@ -1043,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
    endif()

-    if (MARLIN_MOE_SM75_ARCHS) 
+    if (MARLIN_MOE_SM75_ARCHS)
      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_MOE_SM75_SRC}"
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor

 ## Usage

-For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).

 For full CLI reference see:

--- a/benchmarks/attention_benchmarks/README.md
+++ b/benchmarks/attention_benchmarks/README.md
@@ -0,0 +1,266 @@
+# vLLM Attention Benchmarking Suite
+
+Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
+
+## Quick Start
+
+```bash
+cd benchmarks/attention_benchmarks
+
+# Run a pre-configured benchmark
+python benchmark.py --config configs/mla_decode.yaml
+python benchmark.py --config configs/mla_mixed_batch.yaml
+python benchmark.py --config configs/speculative_decode.yaml
+python benchmark.py --config configs/standard_attention.yaml
+python benchmark.py --config configs/reorder_threshold.yaml
+
+# Or run custom benchmarks
+python benchmark.py \
+    --backends flash flashinfer \
+    --batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
+    --output-csv results.csv
+```
+
+## Simplified Batch Specification Grammar
+
+Express workloads concisely using query length and sequence length:
+
+```python
+"q2k"              # 2048-token prefill (q_len=2048, seq_len=2048)
+"q1s1k"            # Decode: 1 token with 1K sequence
+"8q1s1k"           # 8 decode requests
+"q4s1k"            # 4-token extend (e.g., spec decode)
+"2q2k_32q1s1k"     # Mixed: 2 prefills + 32 decodes
+"16q4s1k"          # 16 spec decode (4 tokens each)
+```
+
+### Grammar Rule
+
+```text
+Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+
+- count:   Number of identical requests (optional, default=1)
+- q_len:   Query length (number of new tokens)
+- seq_len: Total sequence length (optional, defaults to q_len for prefill)
+- 'k':     Multiplies value by 1024
+
+Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
+```
+
+**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
+
+## Pre-configured Benchmarks
+
+The suite includes several pre-configured YAML benchmark configurations:
+
+### MLA Decode Benchmark
+
+Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
+
+```bash
+python benchmark.py --config configs/mla_decode.yaml
+```
+
+### MLA Mixed Batch Benchmark
+
+Tests chunked prefill performance with mixed prefill + decode batches.
+
+```bash
+python benchmark.py --config configs/mla_mixed_batch.yaml
+```
+
+### Speculative Decoding Benchmark
+
+Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
+
+```bash
+python benchmark.py --config configs/speculative_decode.yaml
+```
+
+### Standard Attention Benchmark
+
+Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
+
+```bash
+python benchmark.py --config configs/standard_attention.yaml
+```
+
+### Reorder Threshold Study
+
+**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
+
+Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
+
+```bash
+python benchmark.py --config configs/reorder_threshold.yaml
+```
+
+---
+
+## Universal Benchmark
+
+The `benchmark.py` script handles **all** backends - both standard attention and MLA.
+
+### Standard Attention (Flash/Triton/FlashInfer)
+
+```bash
+python benchmark.py \
+    --backends flash triton flashinfer \
+    --batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
+    --num-layers 10 \
+    --repeats 5 \
+    --output-csv results.csv
+```
+
+### MLA Backends
+
+```bash
+# Compare all MLA backends
+python benchmark.py \
+    --backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
+    --batch-specs "64q1s1k" "64q1s4k" \
+    --output-csv mla_results.csv
+```
+
+### Parameter Sweeps
+
+Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
+
+#### CUTLASS MLA num-splits Optimization
+
+**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
+
+```bash
+python benchmark.py \
+    --backend cutlass_mla \
+    --batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
+    --sweep-param num_kv_splits \
+    --sweep-values 1 2 4 8 16 \
+    --output-json optimal_splits.json
+```
+
+#### Reorder Batch Threshold Optimization
+
+**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
+
+```bash
+python benchmark.py \
+    --backend flashmla \
+    --batch-specs "q4s1k" "q8s2k" \
+    --sweep-param reorder_batch_threshold \
+    --sweep-values 1 4 16 64 256 512 \
+    --output-csv threshold_sweep.csv
+```
+
+### All Command-Line Options
+
+```text
+--config CONFIG                     # Path to YAML config file (overrides other args)
+--backends BACKEND [BACKEND ...]    # flash, triton, flashinfer, cutlass_mla,
+                                    # flashinfer_mla, flashattn_mla, flashmla
+--backend BACKEND                   # Single backend (alternative to --backends)
+--batch-specs SPEC [SPEC ...]       # Batch specifications using extended grammar
+
+# Model configuration
+--num-layers N                      # Number of layers
+--head-dim N                        # Head dimension
+--num-q-heads N                     # Query heads
+--num-kv-heads N                    # KV heads
+--block-size N                      # Block size
+
+# Benchmark settings
+--device DEVICE                     # Device (default: cuda:0)
+--repeats N                         # Repetitions
+--warmup-iters N                    # Warmup iterations
+--profile-memory                    # Profile memory usage
+
+# Parameter sweeps
+--sweep-param PARAM                 # Parameter name to sweep (e.g., num_kv_splits,
+                                    # reorder_batch_threshold)
+--sweep-values N [N ...]            # Values to sweep for the parameter
+
+# Output
+--output-csv FILE                   # Save to CSV
+--output-json FILE                  # Save to JSON
+```
+
+## Hardware Requirements
+
+| Backend | Hardware |
+|---------|----------|
+| Flash/Triton/FlashInfer | Any CUDA GPU |
+| CUTLASS MLA | Blackwell (SM100+) |
+| FlashAttn MLA | Hopper (SM90+) |
+| FlashMLA | Hopper (SM90+) |
+| FlashInfer-MLA | Any CUDA GPU |
+
+## Using MLA Runner Directly
+
+All MLA backends are available through `mla_runner.run_mla_benchmark()`:
+
+```python
+from mla_runner import run_mla_benchmark
+from common import BenchmarkConfig
+
+config = BenchmarkConfig(
+    backend="cutlass_mla",
+    batch_spec="64q1s4k",
+    num_layers=10,
+    head_dim=576,
+    num_q_heads=128,
+    num_kv_heads=1,
+    block_size=128,
+    device="cuda:0",
+    repeats=5,
+    warmup_iters=3,
+)
+
+# CUTLASS MLA with specific num_kv_splits
+result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
+print(f"Time: {result.mean_time:.6f}s")
+
+# FlashInfer-MLA
+result = run_mla_benchmark("flashinfer_mla", config)
+
+# FlashAttn MLA (Hopper SM90+)
+result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
+
+# FlashMLA (Hopper SM90+)
+result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
+```
+
+## Python API
+
+```python
+from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
+from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
+
+# Parse batch specs
+requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
+print(format_batch_spec(requests))
+# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
+
+# Get batch statistics
+stats = get_batch_stats(requests)
+print(f"Total tokens: {stats['total_tokens']}")
+print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
+
+# Format results
+formatter = ResultsFormatter()
+formatter.save_csv(results, "output.csv")
+formatter.save_json(results, "output.json")
+```
+
+## Tips
+
+**1. Warmup matters** - Use `--warmup-iters 10` for stable results
+
+**2. Multiple repeats** - Use `--repeats 20` for low variance
+
+**3. Save results** - Always use `--output-csv` or `--output-json`
+
+**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
+
+**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
+
+**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values
--- a/benchmarks/attention_benchmarks/init.py
+++ b/benchmarks/attention_benchmarks/init.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""vLLM Attention Benchmarking Suite."""
+
+from .batch_spec import (
+    BatchRequest,
+    format_batch_spec,
+    get_batch_stats,
+    parse_batch_spec,
+    reorder_for_flashinfer,
+    split_by_type,
+)
+from .common import (
+    BenchmarkConfig,
+    BenchmarkResult,
+    MockLayer,
+    MockModelConfig,
+    ResultsFormatter,
+    get_attention_scale,
+    is_mla_backend,
+    setup_mla_dims,
+)
+
+__all__ = [
+    # Batch specification
+    "BatchRequest",
+    "parse_batch_spec",
+    "format_batch_spec",
+    "reorder_for_flashinfer",
+    "split_by_type",
+    "get_batch_stats",
+    # Benchmarking infrastructure
+    "BenchmarkConfig",
+    "BenchmarkResult",
+    "ResultsFormatter",
+    # Mock objects
+    "MockLayer",
+    "MockModelConfig",
+    # Utilities
+    "setup_mla_dims",
+    "get_attention_scale",
+    "is_mla_backend",
+]
--- a/benchmarks/attention_benchmarks/batch_spec.py
+++ b/benchmarks/attention_benchmarks/batch_spec.py
@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simplified batch specification grammar for attention benchmarks.
+
+Grammar (underscore-separated segments):
+  Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+
+  - count: Number of identical requests (optional, default=1)
+  - q_len: Query length (number of new tokens)
+  - seq_len: Total sequence length (optional, defaults to q_len for prefill)
+  - 'k' suffix: Multiplies value by 1024
+
+Common patterns:
+  - Prefill:  q_len == seq_len  (e.g., "q2k" → 2048 new tokens, 2048 seq)
+  - Decode:   q_len == 1        (e.g., "q1s1k" → 1 token, 1024 seq length)
+  - Extend:   q_len < seq_len   (e.g., "q4s1k" → 4 tokens, 1024 seq length)
+
+Examples:
+  q2k              -> [(2048, 2048)]           # Prefill: 2048 tokens
+  q1s1k            -> [(1, 1024)]              # Decode: 1 token, 1K sequence
+  8q1s1k           -> [(1, 1024)] * 8          # 8 decode requests
+  q4s1k            -> [(4, 1024)]              # 4-token extend (spec decode)
+  2q1k_32q1s1k     -> [(1024, 1024)] * 2 + [(1, 1024)] * 32  # Mixed batch
+  16q4s1k          -> [(4, 1024)] * 16         # 16 spec decode requests
+"""
+
+from collections import Counter
+from dataclasses import dataclass
+
+import regex as re
+
+
+@dataclass
+class BatchRequest:
+    """One request in a batch, described by its query and KV-cache lengths."""
+
+    q_len: int  # Query length (number of new tokens)
+    kv_len: int  # Total KV cache length
+
+    @property
+    def is_decode(self) -> bool:
+        """True if this is a decode request (q_len == 1); (1, 1) is also a prefill."""
+        return self.q_len == 1
+
+    @property
+    def is_prefill(self) -> bool:
+        """True if this is a pure prefill (q_len == kv_len, so context_len == 0)."""
+        return self.q_len == self.kv_len
+
+    @property
+    def is_extend(self) -> bool:
+        """True if this is context extension (q_len > 1 and kv_len > q_len)."""
+        return self.q_len > 1 and self.kv_len > self.q_len
+
+    @property
+    def context_len(self) -> int:
+        """Context length (kv_len - q_len); negative when kv_len < q_len."""
+        return self.kv_len - self.q_len
+
+    def as_tuple(self) -> tuple[int, int]:
+        """Return the request as a plain (q_len, kv_len) tuple."""
+        return (self.q_len, self.kv_len)
+
+
+def _parse_size(size_str: str, k_suffix: str) -> int:
+    """Parse size string with optional 'k' suffix."""
+    size = int(size_str)
+    return size * 1024 if k_suffix == "k" else size
+
+
+def parse_batch_spec(spec: str) -> list[BatchRequest]:
+    """
+    Parse batch specification string into list of BatchRequest objects.
+
+    Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+
+    Args:
+        spec: Batch specification string (see module docstring for grammar)
+
+    Returns:
+        List of BatchRequest objects
+
+    Raises:
+        ValueError: If spec format is invalid
+    """
+    requests = []
+
+    for seg in spec.split("_"):
+        # Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+        m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
+        if m:
+            cnt = int(m.group(1)) if m.group(1) else 1
+            q_len = _parse_size(m.group(2), m.group(3))
+            kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
+            requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
+            continue
+
+        raise ValueError(f"Invalid batch spec segment: '{seg}'")
+
+    return requests
+
+
+def format_batch_spec(requests: list[BatchRequest]) -> str:
+    """
+    Format list of BatchRequest into human-readable string.
+
+    Groups requests by type and provides counts and sizes.
+
+    Args:
+        requests: List of BatchRequest objects
+
+    Returns:
+        Formatted string describing the batch
+    """
+    kinds = {
+        "prefill": [],
+        "extend": [],
+        "decode": [],
+    }
+
+    for req in requests:
+        tup = (req.q_len, req.kv_len)
+        if req.is_prefill:
+            kinds["prefill"].append(tup)
+        elif req.is_extend:
+            kinds["extend"].append(tup)
+        elif req.is_decode:
+            kinds["decode"].append(tup)
+
+    parts = []
+    for kind in ["prefill", "extend", "decode"]:
+        lst = kinds[kind]
+        if not lst:
+            continue
+
+        cnt_total = len(lst)
+        ctr = Counter(lst)
+        inner = []
+
+        for (q, kv), cnt in ctr.items():
+            if kind == "prefill":
+                size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
+                inner.append(f"{cnt}x{size}")
+            elif kind == "decode":
+                size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
+                inner.append(f"{cnt}x{size}")
+            else:  # extend
+                qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
+                kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
+                inner.append(f"{cnt}xq{qstr}kv{kstr}")
+
+        parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
+
+    return ", ".join(parts)
+
+
+def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
+    """
+    Reorder requests for FlashInfer: decode first, then prefill.
+
+    FlashInfer expects decode requests before prefill requests for
+    optimal performance.
+
+    Args:
+        requests: Original list of BatchRequest
+
+    Returns:
+        Reordered list with decode requests first
+    """
+    decodes = [r for r in requests if r.is_decode]
+    non_decodes = [r for r in requests if not r.is_decode]
+    return decodes + non_decodes
+
+
+def split_by_type(
+    requests: list[BatchRequest],
+) -> dict[str, list[BatchRequest]]:
+    """
+    Split requests by type for analysis.
+
+    Args:
+        requests: List of BatchRequest
+
+    Returns:
+        Dict with keys: 'decode', 'prefill', 'extend'
+    """
+    result = {
+        "decode": [],
+        "prefill": [],
+        "extend": [],
+    }
+
+    for req in requests:
+        if req.is_decode:
+            result["decode"].append(req)
+        elif req.is_prefill:
+            result["prefill"].append(req)
+        elif req.is_extend:
+            result["extend"].append(req)
+
+    return result
+
+
+def get_batch_stats(requests: list[BatchRequest]) -> dict:
+    """
+    Compute statistics about a batch.
+
+    Args:
+        requests: List of BatchRequest
+
+    Returns:
+        Dict with batch statistics
+    """
+    by_type = split_by_type(requests)
+
+    return {
+        "total_requests": len(requests),
+        "num_decode": len(by_type["decode"]),
+        "num_prefill": len(by_type["prefill"]),
+        "num_extend": len(by_type["extend"]),
+        "total_tokens": sum(r.q_len for r in requests),
+        "total_kv_cache": sum(r.kv_len for r in requests),
+        "max_q_len": max((r.q_len for r in requests), default=0),
+        "max_kv_len": max((r.kv_len for r in requests), default=0),
+        "avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
+        "avg_kv_len": (
+            sum(r.kv_len for r in requests) / len(requests) if requests else 0
+        ),
+    }
--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
@@ -0,0 +1,886 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Universal vLLM Attention Benchmark
+
+Benchmark any attention backend with the extended grammar.
+Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
+
+Examples:
+    # Standard attention
+    python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
+
+    # MLA backends
+    python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
+
+    # Parameter sweep (CLI)
+    python benchmark.py --backend cutlass_mla \
+                        --batch-specs "64q1s1k" \
+                        --sweep-param num_kv_splits \
+                        --sweep-values 1 4 8 16
+
+    # Parameter sweep (YAML config - recommended)
+    python benchmark.py --config configs/cutlass_numsplits.yaml
+"""
+
+import argparse
+import sys
+from dataclasses import replace
+from pathlib import Path
+
+import yaml
+from rich.console import Console
+from tqdm import tqdm
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from batch_spec import parse_batch_spec
+from common import (
+    BenchmarkConfig,
+    BenchmarkResult,
+    ModelParameterSweep,
+    ParameterSweep,
+    ResultsFormatter,
+    is_mla_backend,
+)
+
+
+def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
+    """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
+    from runner import run_attention_benchmark
+
+    return run_attention_benchmark(config)
+
+
+def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
+    """Run MLA benchmark with appropriate backend."""
+    from mla_runner import run_mla_benchmark as run_mla
+
+    return run_mla(config.backend, config, **kwargs)
+
+
+def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
+    """
+    Run a single benchmark with proper backend selection.
+
+    Args:
+        config: BenchmarkConfig with backend, batch_spec, and model params
+        **kwargs: Additional arguments passed to MLA benchmarks
+
+    Returns:
+        BenchmarkResult (may have error field set on failure)
+    """
+    try:
+        if is_mla_backend(config.backend):
+            return run_mla_benchmark(config, **kwargs)
+        else:
+            return run_standard_attention_benchmark(config)
+    except Exception as e:
+        return BenchmarkResult(
+            config=config,
+            mean_time=float("inf"),
+            std_time=0,
+            min_time=float("inf"),
+            max_time=float("inf"),
+            error=str(e),
+        )
+
+
+def run_model_parameter_sweep(
+    backends: list[str],
+    batch_specs: list[str],
+    base_config_args: dict,
+    sweep: ModelParameterSweep,
+    console: Console,
+) -> list[BenchmarkResult]:
+    """
+    Run model parameter sweep for given backends and batch specs.
+
+    Args:
+        backends: List of backend names
+        batch_specs: List of batch specifications
+        base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
+        sweep: ModelParameterSweep configuration
+        console: Rich console for output
+
+    Returns:
+        List of BenchmarkResult objects
+    """
+    all_results = []
+
+    console.print(
+        f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
+    )
+
+    total = len(backends) * len(batch_specs) * len(sweep.values)
+
+    with tqdm(total=total, desc="Benchmarking") as pbar:
+        for backend in backends:
+            for spec in batch_specs:
+                for value in sweep.values:
+                    # Create config with modified model parameter
+                    config_args = base_config_args.copy()
+                    config_args[sweep.param_name] = value
+
+                    # Create config with original backend for running
+                    clean_config = BenchmarkConfig(
+                        backend=backend, batch_spec=spec, **config_args
+                    )
+
+                    # Run benchmark
+                    result = run_benchmark(clean_config)
+
+                    # Replace backend with labeled version for display
+                    backend_label = sweep.get_label(backend, value)
+                    labeled_config = replace(result.config, backend=backend_label)
+                    result = replace(result, config=labeled_config)
+                    all_results.append(result)
+
+                    if not result.success:
+                        console.print(
+                            f"[red]Error {backend} {spec} {sweep.param_name}="
+                            f"{value}: {result.error}[/]"
+                        )
+
+                    pbar.update(1)
+
+    # Display sweep results - create separate table for each parameter value
+    console.print("\n[bold green]Model Parameter Sweep Results:[/]")
+    formatter = ResultsFormatter(console)
+
+    # Group results by parameter value and extract backend mapping
+    by_param_value = {}
+    backend_mapping = {}  # Maps labeled backend -> original backend
+
+    for r in all_results:
+        # Extract original backend and param value from labeled backend
+        # The label format is: {backend}_{param_name}_{value}
+        # We need to reverse engineer this
+        labeled_backend = r.config.backend
+
+        # Try each backend to find which one this result belongs to
+        for backend in backends:
+            for value in sweep.values:
+                expected_label = sweep.get_label(backend, value)
+                if labeled_backend == expected_label:
+                    backend_mapping[labeled_backend] = backend
+                    param_value = str(value)
+
+                    if param_value not in by_param_value:
+                        by_param_value[param_value] = []
+                    by_param_value[param_value].append(r)
+                    break
+
+    # Create a table for each parameter value
+    sorted_param_values = sorted(
+        by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
+    )
+
+    for param_value in sorted_param_values:
+        console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
+        param_results = by_param_value[param_value]
+
+        # Create modified results with original backend names
+        modified_results = []
+        for r in param_results:
+            # Get the original backend name from our mapping
+            original_backend = backend_mapping[r.config.backend]
+            modified_config = replace(r.config, backend=original_backend)
+            modified_result = replace(r, config=modified_config)
+            modified_results.append(modified_result)
+
+        # Print table with original backend names
+        formatter.print_table(modified_results, backends, compare_to_fastest=True)
+
+    # Show optimal backend for each (param_value, batch_spec) combination
+    console.print(
+        f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
+    )
+
+    # Group by (param_value, batch_spec)
+    by_param_and_spec = {}
+    for r in all_results:
+        if r.success:
+            # Find which (backend, value) this result corresponds to
+            labeled_backend = r.config.backend
+            for backend in backends:
+                for value in sweep.values:
+                    expected_label = sweep.get_label(backend, value)
+                    if labeled_backend == expected_label:
+                        param_value = str(value)
+                        spec = r.config.batch_spec
+                        key = (param_value, spec)
+
+                        if key not in by_param_and_spec:
+                            by_param_and_spec[key] = []
+                        by_param_and_spec[key].append(r)
+                        break
+
+    # Sort by param value then spec
+    sorted_keys = sorted(
+        by_param_and_spec.keys(),
+        key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
+    )
+
+    current_param_value = None
+    for param_value, spec in sorted_keys:
+        # Print header when param value changes
+        if param_value != current_param_value:
+            console.print(f"\n  [bold]{sweep.param_name}={param_value}:[/]")
+            current_param_value = param_value
+
+        results = by_param_and_spec[(param_value, spec)]
+        best = min(results, key=lambda r: r.mean_time)
+
+        # Extract original backend name using the mapping
+        backend_name = backend_mapping[best.config.backend]
+
+        # Show all backends' times for comparison
+        times_str = " | ".join(
+            [
+                f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
+                for r in sorted(results, key=lambda r: r.mean_time)
+            ]
+        )
+
+        console.print(
+            f"    {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
+        )
+
+    return all_results
+
+
+def run_parameter_sweep(
+    backends: list[str],
+    batch_specs: list[str],
+    base_config_args: dict,
+    sweep: ParameterSweep,
+    console: Console,
+) -> list[BenchmarkResult]:
+    """
+    Run parameter sweep for given backends and batch specs.
+
+    Args:
+        backends: List of backend names
+        batch_specs: List of batch specifications
+        base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
+        sweep: ParameterSweep configuration
+        console: Rich console for output
+
+    Returns:
+        List of BenchmarkResult objects
+    """
+    all_results = []
+
+    # Build list of values to sweep (including auto if requested)
+    sweep_values = list(sweep.values)
+    if sweep.include_auto:
+        sweep_values.append("auto")
+
+    console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
+
+    total = len(backends) * len(batch_specs) * len(sweep_values)
+
+    with tqdm(total=total, desc="Benchmarking") as pbar:
+        for backend in backends:
+            for spec in batch_specs:
+                for value in sweep_values:
+                    # Create config with original backend for running
+                    config = BenchmarkConfig(
+                        backend=backend, batch_spec=spec, **base_config_args
+                    )
+
+                    # Prepare kwargs for benchmark runner
+                    kwargs = {}
+                    if value != "auto":
+                        kwargs[sweep.param_name] = value
+
+                    # Run benchmark
+                    result = run_benchmark(config, **kwargs)
+
+                    # Replace backend with labeled version for display
+                    backend_label = sweep.get_label(backend, value)
+                    labeled_config = replace(result.config, backend=backend_label)
+                    result = replace(result, config=labeled_config)
+                    all_results.append(result)
+
+                    if not result.success:
+                        console.print(
+                            f"[red]Error {backend} {spec} {sweep.param_name}="
+                            f"{value}: {result.error}[/]"
+                        )
+
+                    pbar.update(1)
+
+    # Display sweep results
+    console.print("\n[bold green]Sweep Results:[/]")
+    backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
+    formatter = ResultsFormatter(console)
+    formatter.print_table(all_results, backend_labels)
+
+    # Show optimal values
+    console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
+    by_spec = {}
+    for r in all_results:
+        if r.success:
+            spec = r.config.batch_spec
+            if spec not in by_spec:
+                by_spec[spec] = []
+            by_spec[spec].append(r)
+
+    for spec in sorted(by_spec.keys()):
+        results = by_spec[spec]
+        best = min(results, key=lambda r: r.mean_time)
+        console.print(
+            f"  {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
+        )
+
+    return all_results
+
+
+def load_config_from_yaml(config_path: str) -> dict:
+    """Load configuration from YAML file."""
+    with open(config_path) as f:
+        return yaml.safe_load(f)
+
+
+def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
+    """
+    Generate batch specs from range specifications.
+
+    Args:
+        ranges: List of range specifications, each containing:
+            - template: Batch spec template (e.g., "q{q_len}kv1k")
+            - q_len: Dict with start, stop, step, end_inclusive (optional)
+            - Other parameters can also be ranges
+
+    Returns:
+        List of generated batch spec strings
+
+    Example:
+        ranges = [
+            {
+                "template": "q{q_len}kv1k",
+                "q_len": {
+                    "start": 1,
+                    "stop": 16,
+                    "step": 1,
+                    "end_inclusive": true  # Optional, defaults to true
+                }
+            }
+        ]
+        Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
+    """
+    all_specs = []
+
+    for range_spec in ranges:
+        template = range_spec.get("template")
+        if not template:
+            raise ValueError("Range specification must include 'template'")
+
+        # Extract all range parameters from the spec
+        range_params = {}
+        for key, value in range_spec.items():
+            if key == "template":
+                continue
+            if isinstance(value, dict) and "start" in value:
+                # This is a range specification
+                start = value["start"]
+                stop = value["stop"]
+                step = value.get("step", 1)
+                # Check if end should be inclusive (default: True)
+                end_inclusive = value.get("end_inclusive", True)
+
+                # Adjust stop based on end_inclusive
+                if end_inclusive:
+                    range_params[key] = list(range(start, stop + 1, step))
+                else:
+                    range_params[key] = list(range(start, stop, step))
+            else:
+                # This is a fixed value
+                range_params[key] = [value]
+
+        # Generate all combinations (Cartesian product)
+        if range_params:
+            import itertools
+
+            param_names = list(range_params.keys())
+            param_values = [range_params[name] for name in param_names]
+
+            for values in itertools.product(*param_values):
+                params = dict(zip(param_names, values))
+                spec = template.format(**params)
+                all_specs.append(spec)
+        else:
+            # No parameters, just use template as-is
+            all_specs.append(template)
+
+    return all_specs
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Universal vLLM attention benchmark",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    # Config file
+    parser.add_argument(
+        "--config",
+        help="Path to YAML config file (overrides other args)",
+    )
+
+    # Backend selection
+    parser.add_argument(
+        "--backends",
+        nargs="+",
+        help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
+        "flashinfer_mla, flashattn_mla, flashmla)",
+    )
+    parser.add_argument(
+        "--backend",
+        help="Single backend (alternative to --backends)",
+    )
+
+    # Batch specifications
+    parser.add_argument(
+        "--batch-specs",
+        nargs="+",
+        default=["q2k", "8q1s1k"],
+        help="Batch specifications using extended grammar",
+    )
+
+    # Model config
+    parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
+    parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
+    parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
+    parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
+    parser.add_argument("--block-size", type=int, default=16, help="Block size")
+
+    # Benchmark settings
+    parser.add_argument("--device", default="cuda:0", help="Device")
+    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
+    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
+    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
+
+    # Parameter sweep (use YAML config for advanced sweeps)
+    parser.add_argument(
+        "--sweep-param",
+        help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
+    )
+    parser.add_argument(
+        "--sweep-values",
+        type=int,
+        nargs="+",
+        help="Values to sweep for the parameter",
+    )
+
+    # Output
+    parser.add_argument("--output-csv", help="Save to CSV")
+    parser.add_argument("--output-json", help="Save to JSON")
+
+    args = parser.parse_args()
+
+    console = Console()
+    console.print("[bold cyan]vLLM Attention Benchmark[/]")
+
+    # Load config from YAML if provided
+    if args.config:
+        console.print(f"[yellow]Loading config from: {args.config}[/]")
+        yaml_config = load_config_from_yaml(args.config)
+
+        # Show description if available
+        if "description" in yaml_config:
+            console.print(f"[dim]{yaml_config['description']}[/]")
+
+        # Override args with YAML values
+        # (YAML takes precedence unless CLI arg was explicitly set)
+        # Backend(s)
+        if "backend" in yaml_config:
+            args.backend = yaml_config["backend"]
+            args.backends = None
+        elif "backends" in yaml_config:
+            args.backends = yaml_config["backends"]
+            args.backend = None
+
+        # Check for special modes
+        if "mode" in yaml_config:
+            args.mode = yaml_config["mode"]
+        else:
+            args.mode = None
+
+        # Batch specs and sizes
+        # Support both explicit batch_specs and generated batch_spec_ranges
+        if "batch_spec_ranges" in yaml_config:
+            # Generate batch specs from ranges
+            generated_specs = generate_batch_specs_from_ranges(
+                yaml_config["batch_spec_ranges"]
+            )
+            # Combine with any explicit batch_specs
+            if "batch_specs" in yaml_config:
+                args.batch_specs = yaml_config["batch_specs"] + generated_specs
+            else:
+                args.batch_specs = generated_specs
+            console.print(
+                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
+            )
+        elif "batch_specs" in yaml_config:
+            args.batch_specs = yaml_config["batch_specs"]
+
+        if "batch_sizes" in yaml_config:
+            args.batch_sizes = yaml_config["batch_sizes"]
+        else:
+            args.batch_sizes = None
+
+        # Model config
+        if "model" in yaml_config:
+            model = yaml_config["model"]
+            args.num_layers = model.get("num_layers", args.num_layers)
+            args.head_dim = model.get("head_dim", args.head_dim)
+            args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
+            args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
+            args.block_size = model.get("block_size", args.block_size)
+
+        # Benchmark settings
+        if "benchmark" in yaml_config:
+            bench = yaml_config["benchmark"]
+            args.device = bench.get("device", args.device)
+            args.repeats = bench.get("repeats", args.repeats)
+            args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
+            args.profile_memory = bench.get("profile_memory", args.profile_memory)
+
+        # Parameter sweep configuration
+        if "parameter_sweep" in yaml_config:
+            sweep_config = yaml_config["parameter_sweep"]
+            args.parameter_sweep = ParameterSweep(
+                param_name=sweep_config["param_name"],
+                values=sweep_config["values"],
+                include_auto=sweep_config.get("include_auto", False),
+                label_format=sweep_config.get(
+                    "label_format", "{backend}_{param_name}_{value}"
+                ),
+            )
+        else:
+            args.parameter_sweep = None
+
+        # Model parameter sweep configuration
+        if "model_parameter_sweep" in yaml_config:
+            sweep_config = yaml_config["model_parameter_sweep"]
+            args.model_parameter_sweep = ModelParameterSweep(
+                param_name=sweep_config["param_name"],
+                values=sweep_config["values"],
+                label_format=sweep_config.get(
+                    "label_format", "{backend}_{param_name}_{value}"
+                ),
+            )
+        else:
+            args.model_parameter_sweep = None
+
+        # Output
+        if "output" in yaml_config:
+            output = yaml_config["output"]
+            if "csv" in output and not args.output_csv:
+                args.output_csv = output["csv"]
+            if "json" in output and not args.output_json:
+                args.output_json = output["json"]
+
+        console.print()
+
+    # Handle CLI-based parameter sweep (if not from YAML)
+    if (
+        (not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
+        and args.sweep_param
+        and args.sweep_values
+    ):
+        args.parameter_sweep = ParameterSweep(
+            param_name=args.sweep_param,
+            values=args.sweep_values,
+            include_auto=False,
+            label_format="{backend}_{param_name}_{value}",
+        )
+
+    # Determine backends
+    backends = args.backends or ([args.backend] if args.backend else ["flash"])
+    console.print(f"Backends: {', '.join(backends)}")
+    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
+    console.print()
+
+    # Run benchmarks
+    all_results = []
+
+    # Handle special mode: decode_vs_prefill comparison
+    if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
+        console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
+        console.print(
+            "[dim]For each query length, testing both decode and prefill pipelines[/]"
+        )
+        console.print("[dim]Using batched execution for optimal performance[/]")
+
+        # Extract batch sizes from config
+        batch_sizes = getattr(args, "batch_sizes", [1])
+        backend = backends[0]  # Use first backend (should only be one)
+
+        # Calculate total benchmarks
+        total = len(batch_sizes)
+
+        with tqdm(total=total, desc="Benchmarking") as pbar:
+            for batch_size in batch_sizes:
+                # Prepare all configs for this batch size
+                configs_with_thresholds = []
+
+                for spec in args.batch_specs:
+                    # Parse the batch spec to get query length
+                    requests = parse_batch_spec(spec)
+                    if not requests:
+                        console.print(
+                            f"[red]Error: Could not parse batch spec '{spec}'[/]"
+                        )
+                        continue
+
+                    # Get query length from first request
+                    query_length = requests[0].q_len
+
+                    # Create batch spec for this batch size
+                    # For batch_size > 1, we need to prepend the count
+                    batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
+
+                    # Create base config (without backend name)
+                    base_config = BenchmarkConfig(
+                        backend=backend,  # Will be overridden later
+                        batch_spec=batch_spec,
+                        num_layers=args.num_layers,
+                        head_dim=args.head_dim,
+                        num_q_heads=args.num_q_heads,
+                        num_kv_heads=args.num_kv_heads,
+                        block_size=args.block_size,
+                        device=args.device,
+                        repeats=args.repeats,
+                        warmup_iters=args.warmup_iters,
+                        profile_memory=args.profile_memory,
+                    )
+
+                    # Add decode pipeline config
+                    decode_threshold = query_length
+                    config_decode = replace(
+                        base_config,
+                        backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
+                    )
+                    configs_with_thresholds.append((config_decode, decode_threshold))
+
+                    # Add prefill pipeline config if query_length > 1
+                    if query_length > 1:
+                        prefill_threshold = query_length - 1
+                        config_prefill = replace(
+                            base_config,
+                            backend=f"{backend}_prefill_qlen{query_length}"
+                            f"_bs{batch_size}",
+                        )
+                        configs_with_thresholds.append(
+                            (config_prefill, prefill_threshold)
+                        )
+
+                # Run all benchmarks for this batch size in one go (batched mode)
+                try:
+                    from mla_runner import run_mla_benchmark as run_mla
+
+                    # Use batched API: pass list of (config, threshold) tuples
+                    timing_results = run_mla(backend, configs_with_thresholds)
+
+                    # Create BenchmarkResult objects from timing results
+                    for (config, _), timing in zip(
+                        configs_with_thresholds, timing_results
+                    ):
+                        result = BenchmarkResult(
+                            config=config,
+                            mean_time=timing["mean"],
+                            std_time=timing["std"],
+                            min_time=timing["min"],
+                            max_time=timing["max"],
+                            throughput_tokens_per_sec=timing.get("throughput", None),
+                        )
+                        all_results.append(result)
+
+                except Exception as e:
+                    import traceback
+
+                    console.print(
+                        f"[red]Error running batched benchmarks for "
+                        f"batch_size={batch_size}: {e}[/]"
+                    )
+                    console.print("[red]Traceback:[/]")
+                    traceback.print_exc()
+                    # Add error results for all configs
+                    for config, _ in configs_with_thresholds:
+                        result = BenchmarkResult(
+                            config=config,
+                            mean_time=float("inf"),
+                            std_time=0,
+                            min_time=float("inf"),
+                            max_time=float("inf"),
+                            error=str(e),
+                        )
+                        all_results.append(result)
+
+                pbar.update(1)
+
+        # Display decode vs prefill results
+        console.print("\n[bold green]Decode vs Prefill Results:[/]")
+
+        # Group by batch size
+        by_batch_size = {}
+        for r in all_results:
+            if r.success:
+                # Extract batch size from backend name
+                parts = r.config.backend.split("_")
+                bs_part = [p for p in parts if p.startswith("bs")]
+                if bs_part:
+                    bs = int(bs_part[0][2:])
+                    if bs not in by_batch_size:
+                        by_batch_size[bs] = []
+                    by_batch_size[bs].append(r)
+
+        # For each batch size, analyze crossover point
+        for bs in sorted(by_batch_size.keys()):
+            console.print(f"\n[bold cyan]Batch size: {bs}[/]")
+            results = by_batch_size[bs]
+
+            # Group by query length
+            by_qlen = {}
+            for r in results:
+                parts = r.config.backend.split("_")
+                qlen_part = [p for p in parts if p.startswith("qlen")]
+                if qlen_part:
+                    qlen = int(qlen_part[0][4:])
+                    if qlen not in by_qlen:
+                        by_qlen[qlen] = {}
+
+                    pipeline = "decode" if "decode" in r.config.backend else "prefill"
+                    by_qlen[qlen][pipeline] = r
+
+            # Find crossover point
+            last_decode_faster = None
+            for qlen in sorted(by_qlen.keys()):
+                pipelines = by_qlen[qlen]
+                if "decode" in pipelines and "prefill" in pipelines:
+                    decode_time = pipelines["decode"].mean_time
+                    prefill_time = pipelines["prefill"].mean_time
+                    faster = "decode" if decode_time < prefill_time else "prefill"
+
+                    speedup = (
+                        prefill_time / decode_time
+                        if decode_time < prefill_time
+                        else decode_time / prefill_time
+                    )
+
+                    console.print(
+                        f"  qlen={qlen:3d}: decode={decode_time:.6f}s, "
+                        f"prefill={prefill_time:.6f}s -> "
+                        f"[bold]{faster}[/] ({speedup:.2f}x)"
+                    )
+
+                    if faster == "decode":
+                        last_decode_faster = qlen
+
+            if last_decode_faster is not None:
+                optimal_threshold = last_decode_faster
+                console.print(
+                    f"\n  [bold green]Optimal threshold for batch_size={bs}: "
+                    f"{optimal_threshold}[/]"
+                )
+                console.print(
+                    f"  [dim](Use decode pipeline for query_length <= "
+                    f"{optimal_threshold})[/]"
+                )
+            else:
+                console.print(
+                    f"\n  [yellow]Prefill always faster for batch_size={bs}[/]"
+                )
+
+    # Handle model parameter sweep mode
+    elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
+        # Model parameter sweep
+        base_config_args = {
+            "num_layers": args.num_layers,
+            "head_dim": args.head_dim,
+            "num_q_heads": args.num_q_heads,
+            "num_kv_heads": args.num_kv_heads,
+            "block_size": args.block_size,
+            "device": args.device,
+            "repeats": args.repeats,
+            "warmup_iters": args.warmup_iters,
+            "profile_memory": args.profile_memory,
+        }
+        all_results = run_model_parameter_sweep(
+            backends,
+            args.batch_specs,
+            base_config_args,
+            args.model_parameter_sweep,
+            console,
+        )
+
+    # Handle parameter sweep mode (unified)
+    elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
+        # Unified parameter sweep
+        base_config_args = {
+            "num_layers": args.num_layers,
+            "head_dim": args.head_dim,
+            "num_q_heads": args.num_q_heads,
+            "num_kv_heads": args.num_kv_heads,
+            "block_size": args.block_size,
+            "device": args.device,
+            "repeats": args.repeats,
+            "warmup_iters": args.warmup_iters,
+            "profile_memory": args.profile_memory,
+        }
+        all_results = run_parameter_sweep(
+            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
+        )
+
+    else:
+        # Normal mode: compare backends
+        total = len(backends) * len(args.batch_specs)
+
+        with tqdm(total=total, desc="Benchmarking") as pbar:
+            for spec in args.batch_specs:
+                for backend in backends:
+                    config = BenchmarkConfig(
+                        backend=backend,
+                        batch_spec=spec,
+                        num_layers=args.num_layers,
+                        head_dim=args.head_dim,
+                        num_q_heads=args.num_q_heads,
+                        num_kv_heads=args.num_kv_heads,
+                        block_size=args.block_size,
+                        device=args.device,
+                        repeats=args.repeats,
+                        warmup_iters=args.warmup_iters,
+                        profile_memory=args.profile_memory,
+                    )
+
+                    result = run_benchmark(config)
+                    all_results.append(result)
+
+                    if not result.success:
+                        console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
+
+                    pbar.update(1)
+
+        # Display results
+        console.print("\n[bold green]Results:[/]")
+        formatter = ResultsFormatter(console)
+        formatter.print_table(all_results, backends)
+
+    # Save results
+    if all_results:
+        formatter = ResultsFormatter(console)
+        if args.output_csv:
+            formatter.save_csv(all_results, args.output_csv)
+        if args.output_json:
+            formatter.save_json(all_results, args.output_json)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -0,0 +1,503 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Common utilities for attention benchmarking."""
+
+import csv
+import json
+import math
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from rich.console import Console
+from rich.table import Table
+
+# Mock classes for vLLM attention infrastructure
+
+
+class MockHfConfig:
+    """Stand-in for a HuggingFace config, populated from an MLA dims dict.
+
+    The constructor copies head counts and MLA dimensions out of ``mla_dims``
+    and derives ``hidden_size`` and ``qk_head_dim`` from them.
+    """
+
+    def __init__(self, mla_dims: dict):
+        # Pull every required entry up front, in a fixed order.
+        n_q, n_kv, head_dim, lora_rank, nope_dim, rope_dim, v_dim = (
+            mla_dims[key]
+            for key in (
+                "num_q_heads",
+                "num_kv_heads",
+                "head_dim",
+                "kv_lora_rank",
+                "qk_nope_head_dim",
+                "qk_rope_head_dim",
+                "v_head_dim",
+            )
+        )
+
+        self.num_attention_heads = n_q
+        self.num_key_value_heads = n_kv
+        self.hidden_size = head_dim * n_q
+        self.model_type = "deepseek_v2"
+        self.is_encoder_decoder = False
+        self.kv_lora_rank = lora_rank
+        self.qk_nope_head_dim = nope_dim
+        self.qk_rope_head_dim = rope_dim
+        self.v_head_dim = v_dim
+        # Full query/key head width is the non-rotary part plus the rotary part.
+        self.qk_head_dim = nope_dim + rope_dim
+
+    def get_text_config(self):
+        """Return this object itself as the text config."""
+        return self
+
+
+# Import AttentionLayerBase at module level to avoid circular dependencies.
+# If the import fails, fall back to plain `object` so that classes in this
+# module that subclass AttentionLayerBase (MockLayer) can still be defined.
+# _HAS_ATTENTION_LAYER_BASE records which of the two cases applies.
+try:
+    from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+
+    _HAS_ATTENTION_LAYER_BASE = True
+except ImportError:
+    _HAS_ATTENTION_LAYER_BASE = False
+    AttentionLayerBase = object  # Fallback
+
+
+class MockKVBProj:
+    """Fake kv_b_proj layer for the MLA prefill path.
+
+    Calling an instance produces random data shaped like a per-head projection
+    of width ``qk_nope_head_dim + v_head_dim``; no real weights are involved.
+    The result is wrapped in a 1-tuple to match the ColumnParallelLinear API.
+    """
+
+    def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
+        self.num_heads, self.qk_nope_head_dim, self.v_head_dim = (
+            num_heads,
+            qk_nope_head_dim,
+            v_head_dim,
+        )
+        # Per-head output width.
+        self.out_dim = qk_nope_head_dim + v_head_dim
+
+    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
+        """
+        Generate a random "projection" for the given input.
+
+        Args:
+            x: Input tensor; only its first dimension (token count), device
+                and dtype are used. Expected as [num_tokens, kv_lora_rank].
+
+        Returns:
+            1-tuple holding a tensor of shape
+                [num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
+        """
+        out_shape = (x.shape[0], self.num_heads, self.out_dim)
+        projected = torch.randn(out_shape, device=x.device, dtype=x.dtype)
+        # Tuple return mirrors the ColumnParallelLinear call convention.
+        return (projected,)
+
+
+class MockLayer(AttentionLayerBase):
+    """Minimal attention layer carrying unit q/k/v scales, an impl and a KV spec.
+
+    Subclasses AttentionLayerBase so that isinstance checks against that base
+    (e.g. in get_layers_from_vllm_config when FlashInfer prefill is enabled)
+    accept it.
+    """
+
+    def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
+        # super().__init__() is intentionally not called: AttentionLayerBase
+        # doesn't have __init__.
+        def _unit_scale() -> torch.Tensor:
+            return torch.tensor(1.0, device=device)
+
+        # Tensor scales, each a separate scalar tensor on `device`.
+        self._k_scale, self._v_scale, self._q_scale = (
+            _unit_scale(),
+            _unit_scale(),
+            _unit_scale(),
+        )
+        # Plain Python floats for kernels that need scalars rather than tensors.
+        self._k_scale_float, self._v_scale_float, self._q_scale_float = (
+            float(scale.item())
+            for scale in (self._k_scale, self._v_scale, self._q_scale)
+        )
+        # AttentionImpl that metadata builders may query.
+        self.impl = impl
+        # Returned verbatim by get_kv_cache_spec().
+        self._kv_cache_spec = kv_cache_spec
+
+    def get_attn_backend(self):
+        """Return the attention backend class; always None for this mock."""
+        return None
+
+    def get_kv_cache_spec(self):
+        """Return the KV cache spec supplied at construction time."""
+        return self._kv_cache_spec
+
+
+class MockModelConfig:
+    """Mock model config exposing head counts, head size, dtype and max length."""
+
+    def __init__(
+        self,
+        num_q_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        dtype: torch.dtype = torch.float16,
+        max_model_len: int = 32768,
+    ):
+        self._n_q, self._n_kv, self._d = num_q_heads, num_kv_heads, head_dim
+        self.dtype = dtype
+        self.max_model_len = max_model_len
+
+    def get_num_attention_heads(self, _=None) -> int:
+        """Number of query heads; the optional argument is ignored."""
+        return self._n_q
+
+    def get_num_kv_heads(self, _=None) -> int:
+        """Number of KV heads; the optional argument is ignored."""
+        return self._n_kv
+
+    def get_head_size(self) -> int:
+        """Per-head dimension passed to the constructor."""
+        return self._d
+
+    def get_num_layers(self) -> int:
+        """Layer count; this mock always reports a single layer."""
+        return 1
+
+    def get_sliding_window_for_layer(self, _layer_idx: int):
+        """Sliding window for a layer; always None in this mock."""
+        return None
+
+    def get_logits_soft_cap_for_layer(self, _layer_idx: int):
+        """Logits soft cap for a layer; always None in this mock."""
+        return None
+
+    def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
+        """Softmax scale for a layer: 1 / sqrt(head size), same for every layer."""
+        root = self.get_head_size() ** 0.5
+        return 1.0 / root
+
+
+class MockParallelConfig:
+    """Empty placeholder for a parallel configuration; defines no attributes."""
+
+
+class MockCompilationConfig:
+    """Mock compilation config: full CUDA graph flag off, empty forward context."""
+
+    def __init__(self):
+        # A fresh dict per instance; never shared between configs.
+        self.static_forward_context = dict()
+        self.full_cuda_graph = False
+
+
+class MockVLLMConfig:
+    """Mock VLLM config that only carries a MockCompilationConfig."""
+
+    def __init__(self):
+        compilation = MockCompilationConfig()
+        self.compilation_config = compilation
+
+
+class MockRunner:
+    """Mock GPU runner bundling the configs and batch arrays for metadata builders."""
+
+    def __init__(
+        self,
+        seq_lens: np.ndarray,
+        query_start_locs: np.ndarray,
+        device: torch.device,
+        num_q_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        dtype: torch.dtype,
+    ):
+        # Config objects (all mocks defined in this module).
+        self.model_config = MockModelConfig(
+            num_q_heads=num_q_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            dtype=dtype,
+        )
+        self.parallel_config = MockParallelConfig()
+        self.vllm_config = MockVLLMConfig()
+
+        # Batch description, stored as given (no copies are made).
+        self.seq_lens_np = seq_lens
+        self.query_start_loc_np = query_start_locs
+
+        # Execution settings.
+        self.device = device
+        self.attention_chunk_size = None
+
+        # Head layout and dtype mirrored directly on the runner.
+        self.num_query_heads = num_q_heads
+        self.num_kv_heads = num_kv_heads
+        self.dtype = dtype
+
+
+@dataclass
+class ParameterSweep:
+    """Describes a sweep over one backend parameter and how to label each run."""
+
+    # Backend parameter being swept.
+    param_name: str
+    # Candidate values to benchmark.
+    values: list[Any]
+    # When True, also run once with the parameter left unset (auto mode).
+    include_auto: bool = False
+    # str.format template; may use {backend}, {param_name} and {value}.
+    label_format: str = "{backend}_{param_name}_{value}"
+
+    def get_label(self, backend: str, value: Any) -> str:
+        """Render ``label_format`` for the given backend and parameter value."""
+        fields = {
+            "backend": backend,
+            "param_name": self.param_name,
+            "value": value,
+        }
+        return self.label_format.format(**fields)
+
+
+@dataclass
+class ModelParameterSweep:
+    """Describes a sweep over one model config parameter and its run labels."""
+
+    # Model config parameter being swept (e.g., "num_q_heads").
+    param_name: str
+    # Candidate values to benchmark.
+    values: list[Any]
+    # str.format template; may use {backend}, {param_name} and {value}.
+    label_format: str = "{backend}_{param_name}_{value}"
+
+    def get_label(self, backend: str, value: Any) -> str:
+        """Render ``label_format`` for the given backend and parameter value."""
+        template = self.label_format
+        label = template.format(
+            value=value, param_name=self.param_name, backend=backend
+        )
+        return label
+
+
+@dataclass
+class BenchmarkConfig:
+    """Configuration for a single benchmark run.
+
+    The first eight fields have no defaults and must always be supplied; field
+    order is part of the positional constructor signature. The MLA-specific
+    and backend-tuning fields default to None (unset).
+    """
+
+    # Required: which backend to run, on which batch spec, and the model shape.
+    backend: str
+    batch_spec: str
+    num_layers: int
+    head_dim: int
+    num_q_heads: int
+    num_kv_heads: int
+    block_size: int
+    device: str
+    # Optional run settings.
+    dtype: torch.dtype = torch.float16
+    repeats: int = 1
+    warmup_iters: int = 3
+    profile_memory: bool = False
+    use_cuda_graphs: bool = False
+
+    # MLA-specific
+    kv_lora_rank: int | None = None
+    qk_nope_head_dim: int | None = None
+    qk_rope_head_dim: int | None = None
+    v_head_dim: int | None = None
+
+    # Backend-specific tuning
+    num_kv_splits: int | None = None  # CUTLASS MLA
+    reorder_batch_threshold: int | None = None  # FlashAttn MLA, FlashMLA
+
+
+@dataclass
+class BenchmarkResult:
+    """Outcome of one benchmark run: timings, optional metrics, optional error."""
+
+    config: BenchmarkConfig
+    # Timing statistics, all in seconds.
+    mean_time: float
+    std_time: float
+    min_time: float
+    max_time: float
+    # Optional metrics; None when not measured.
+    throughput_tokens_per_sec: float | None = None
+    memory_allocated_mb: float | None = None
+    memory_reserved_mb: float | None = None
+    # Error text for a failed run; None means the run succeeded.
+    error: str | None = None
+
+    @property
+    def success(self) -> bool:
+        """True when no error was recorded for this run."""
+        failed = self.error is not None
+        return not failed
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain dict of this result, with the config expanded via asdict."""
+        payload: dict[str, Any] = {"config": asdict(self.config)}
+        # Remaining fields are copied as-is, in declaration order.
+        for key in (
+            "mean_time",
+            "std_time",
+            "min_time",
+            "max_time",
+            "throughput_tokens_per_sec",
+            "memory_allocated_mb",
+            "memory_reserved_mb",
+            "error",
+        ):
+            payload[key] = getattr(self, key)
+        return payload
+
+
+class ResultsFormatter:
+    """Render benchmark results as a rich table and export them to CSV/JSON."""
+
+    def __init__(self, console: Console | None = None):
+        # Fall back to a fresh Console when none is supplied.
+        self.console = console or Console()
+
+    def print_table(
+        self,
+        results: list[BenchmarkResult],
+        backends: list[str],
+        compare_to_fastest: bool = True,
+    ):
+        """
+        Print one table row per batch spec and one column group per backend.
+
+        Args:
+            results: List of BenchmarkResult
+            backends: Backend names, in the column order to display
+            compare_to_fastest: Add a "vs Best" percentage column per backend
+                (only shown when more than one backend is listed)
+        """
+        # Substring abbreviations applied in order to keep column headers short.
+        abbreviations = (
+            ("flashattn_mla", "famla"),
+            ("flashinfer_mla", "fimla"),
+            ("flashmla", "fmla"),
+            ("cutlass_mla", "cmla"),
+            ("numsplits", "ns"),
+        )
+
+        def abbreviate(name: str) -> str:
+            """Abbreviate long backend names for table display."""
+            for long_form, short_form in abbreviations:
+                name = name.replace(long_form, short_form)
+            return name
+
+        # batch spec -> backend -> result (a later duplicate overwrites an earlier one)
+        grouped = {}
+        for res in results:
+            grouped.setdefault(res.config.batch_spec, {})[res.config.backend] = res
+
+        show_relative = len(backends) > 1 and bool(compare_to_fastest)
+
+        table = Table(title="Attention Benchmark Results")
+        table.add_column("Batch\nSpec", no_wrap=True)
+        for backend in backends:
+            header = abbreviate(backend)
+            table.add_column(f"{header}\nTime (s)", justify="right", no_wrap=False)
+            if show_relative:
+                table.add_column(f"{header}\nvs Best", justify="right", no_wrap=False)
+
+        for spec in sorted(grouped):
+            per_backend = grouped[spec]
+            # Only successful runs compete for "best".
+            ok_times = [r.mean_time for r in per_backend.values() if r.success]
+            fastest = min(ok_times) if ok_times else 0.0
+
+            cells = [spec]
+            for backend in backends:
+                if backend not in per_backend:
+                    # No run recorded for this backend/spec pair.
+                    cells.append("-")
+                    if show_relative:
+                        cells.append("-")
+                    continue
+
+                res = per_backend[backend]
+                if not res.success:
+                    cells.append("[red]ERROR[/]")
+                    if show_relative:
+                        cells.append("-")
+                    continue
+
+                cells.append(f"{res.mean_time:.6f}")
+                if show_relative:
+                    if fastest > 0:
+                        pct = res.mean_time / fastest * 100
+                    else:
+                        pct = 0
+                    pct_str = f"{pct:.1f}%"
+                    # Highlight the fastest backend for this spec.
+                    if res.mean_time == fastest:
+                        pct_str = f"[bold green]{pct_str}[/]"
+                    cells.append(pct_str)
+
+            table.add_row(*cells)
+
+        self.console.print(table)
+
+    def save_csv(self, results: list[BenchmarkResult], path: str):
+        """Write a summary row per result to ``path``; no-op for empty results."""
+        if not results:
+            return
+
+        # Make sure the destination directory exists.
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+
+        columns = [
+            "backend",
+            "batch_spec",
+            "num_layers",
+            "mean_time",
+            "std_time",
+            "throughput",
+            "memory_mb",
+        ]
+        with open(path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=columns)
+            writer.writeheader()
+            for res in results:
+                cfg = res.config
+                values = (
+                    cfg.backend,
+                    cfg.batch_spec,
+                    cfg.num_layers,
+                    res.mean_time,
+                    res.std_time,
+                    # Unmeasured (None) metrics are written as 0.
+                    res.throughput_tokens_per_sec or 0,
+                    res.memory_allocated_mb or 0,
+                )
+                writer.writerow(dict(zip(columns, values)))
+
+        self.console.print(f"[green]Saved CSV results to {path}[/]")
+
+    def save_json(self, results: list[BenchmarkResult], path: str):
+        """Write every result's ``to_dict()`` form to ``path`` as a JSON list."""
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+
+        serialized = []
+        for res in results:
+            serialized.append(res.to_dict())
+
+        with open(path, "w") as f:
+            # default=str stringifies values json cannot encode natively.
+            json.dump(serialized, f, indent=2, default=str)
+
+        self.console.print(f"[green]Saved JSON results to {path}[/]")
+
+
+def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
+    """
+    Look up the MLA dimension table for a known model.
+
+    Args:
+        model_name: One of "deepseek-v2", "deepseek-v3", "deepseek-v2-lite"
+
+    Returns:
+        A new dict with kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim,
+        v_head_dim, num_q_heads, num_kv_heads and head_dim
+
+    Raises:
+        ValueError: If model_name is not one of the known models
+    """
+    # Dimensions shared by every listed model; variants override single entries.
+    shared = {
+        "kv_lora_rank": 512,
+        "qk_nope_head_dim": 128,
+        "qk_rope_head_dim": 64,
+        "v_head_dim": 128,
+        "num_q_heads": 128,
+        "num_kv_heads": 1,
+        "head_dim": 576,
+    }
+    configs = {
+        "deepseek-v2": dict(shared),
+        "deepseek-v3": dict(shared),
+        "deepseek-v2-lite": {**shared, "num_q_heads": 16},
+    }
+
+    try:
+        return configs[model_name]
+    except KeyError:
+        raise ValueError(
+            f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
+        ) from None
+
+
+def get_attention_scale(head_dim: int) -> float:
+    """Return the attention scaling factor, i.e. the reciprocal of sqrt(head_dim)."""
+    root = math.sqrt(head_dim)
+    return 1.0 / root
+
+
+def is_mla_backend(backend: str) -> bool:
+    """
+    Report whether a backend is an MLA backend by calling its class's is_mla().
+
+    The name is upper-cased and looked up in AttentionBackendEnum, so the
+    lookup is case-insensitive.
+
+    Args:
+        backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
+
+    Returns:
+        The result of is_mla() on the backend class, or False if the lookup or
+        class resolution raises KeyError, ValueError or ImportError
+    """
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+    enum_key = backend.upper()
+    try:
+        return AttentionBackendEnum[enum_key].get_class().is_mla()
+    except (KeyError, ValueError, ImportError):
+        # Unknown name or a backend whose class cannot be loaded: not MLA.
+        return False
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
@@ -0,0 +1,61 @@
+# MLA decode-only benchmark configuration
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1  # MLA uses single latent KV
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
+
+batch_specs:
+  # Small batches, varying sequence lengths
+  - "16q1s512"     # 16 requests, 512 KV cache
+  - "16q1s1k"      # 16 requests, 1k KV cache
+  - "16q1s2k"      # 16 requests, 2k KV cache
+  - "16q1s4k"      # 16 requests, 4k KV cache
+
+  # Medium batches
+  - "32q1s1k"      # 32 requests, 1k KV cache
+  - "32q1s2k"      # 32 requests, 2k KV cache
+  - "32q1s4k"      # 32 requests, 4k KV cache
+  - "32q1s8k"      # 32 requests, 8k KV cache
+
+  # Large batches
+  - "64q1s1k"      # 64 requests, 1k KV cache
+  - "64q1s2k"      # 64 requests, 2k KV cache
+  - "64q1s4k"      # 64 requests, 4k KV cache
+  - "64q1s8k"      # 64 requests, 8k KV cache
+
+  # Very large batches
+  - "128q1s1k"     # 128 requests, 1k KV cache
+  - "128q1s2k"     # 128 requests, 2k KV cache
+
+  # Long context
+  - "32q1s16k"     # 32 requests, 16k KV cache
+  - "32q1s32k"     # 32 requests, 32k KV cache
+
+backends:
+  - cutlass_mla
+  - flashinfer_mla
+  - flashattn_mla  # Hopper only
+  - flashmla        # Hopper only
+
+device: "cuda:0"
+repeats: 5
+warmup_iters: 3
+profile_memory: true
+
+# Backend-specific tuning
+cutlass_mla:
+  num_kv_splits: auto  # or specific value like 4, 8, 16
+
+flashattn_mla:
+  reorder_batch_threshold: 512
+
+flashmla:
+  reorder_batch_threshold: 1
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -0,0 +1,60 @@
+# MLA mixed batch benchmark (prefill + decode)
+# Tests chunked prefill performance
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+
+batch_specs:
+  # Small prefill + decode
+  - "1q1k_8q1s1k"           # 1 prefill + 8 decode
+  - "2q2k_16q1s1k"          # 2 prefill + 16 decode
+  - "4q1k_32q1s2k"          # 4 prefill + 32 decode
+
+  # Medium prefill + decode
+  - "2q4k_32q1s2k"          # 2 medium prefill + 32 decode
+  - "4q4k_64q1s2k"          # 4 medium prefill + 64 decode
+  - "8q2k_64q1s4k"          # 8 prefill + 64 decode
+
+  # Large prefill + decode (chunked prefill stress test)
+  - "2q8k_32q1s1k"          # 2 large prefill + 32 decode
+  - "1q16k_16q1s2k"         # 1 very large prefill + 16 decode
+  - "2q16k_32q1s4k"         # 2 very large prefill + 32 decode
+
+  # Context extension + decode
+  - "2q1kkv2k_16q1s1k"       # 2 extend + 16 decode
+  - "4q2kkv4k_32q1s2k"       # 4 extend + 32 decode
+  - "2q1kkv8k_32q1s2k"       # 2 large extend + 32 decode
+
+  # Explicitly chunked prefill
+  - "q8k"           # 8k prefill with chunking hint
+  - "q16k"          # 16k prefill with chunking hint
+  - "2q8k_32q1s2k"    # 2 chunked prefill + 32 decode
+
+  # High decode ratio (realistic serving)
+  - "1q2k_63q1s1k"          # 1 prefill + 63 decode
+  - "2q2k_62q1s2k"          # 2 prefill + 62 decode
+  - "4q4k_60q1s4k"          # 4 prefill + 60 decode
+
+backends:
+  - cutlass_mla
+  - flashinfer_mla
+  - flashattn_mla   # Hopper only
+  - flashmla        # Hopper only
+
+device: "cuda:0"
+repeats: 5
+warmup_iters: 3
+profile_memory: true
+
+# Analyze chunked prefill workspace size impact
+chunked_prefill:
+  test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
--- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
@@ -0,0 +1,88 @@
+# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
+# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
+# Methodology: For each query length, compare decode vs prefill performance to find crossover point
+# Applies to: FlashAttn MLA, FlashMLA
+
+description: "Decode vs Prefill pipeline crossover analysis"
+
+# Test FlashAttn MLA
+backend: flashattn_mla
+
+# Mode: decode_vs_prefill comparison (special sweep mode)
+# For each batch spec, we'll test both decode and prefill pipelines
+mode: "decode_vs_prefill"
+
+# Query lengths to test (from old benchmark_mla_threshold.py methodology)
+# Each query length will be tested with BOTH decode and prefill pipelines:
+#   - decode: threshold >= query_length (forces decode pipeline)
+#   - prefill: threshold < query_length (forces prefill pipeline)
+#
+# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
+# This tests different query lengths with fixed sequence length context
+#
+# Using batch_spec_ranges for automatic generation:
+batch_spec_ranges:
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 1
+      stop: 16
+      step: 1
+      end_inclusive: false
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 16
+      stop: 64
+      step: 2
+      end_inclusive: false
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 64
+      stop: 1024
+      step: 4
+      end_inclusive: true
+
+# Batch sizes to test (from old script)
+batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+
+# Model configuration (DeepSeek V2/V3 defaults)
+model:
+  num_layers: 10
+  head_dim: 576
+  num_q_heads: 128
+  num_kv_heads: 1
+  block_size: 128
+
+# Benchmark settings
+benchmark:
+  device: "cuda:0"
+  repeats: 15          # More repeats for spec decode variance
+  warmup_iters: 5
+  profile_memory: false
+
+# Output
+output:
+  csv: "reorder_threshold_results.csv"
+  json: "reorder_threshold_results.json"
+
+# Expected outcome (reproduces old benchmark_mla_threshold.py study):
+# - For each batch size, find the crossover point where prefill becomes faster than decode
+# - Show decode vs prefill performance across all query lengths
+# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
+# - Understand how crossover point varies with batch size
+# - Provide data-driven guidance for default threshold value
+#
+# Methodology (from old script):
+# - Each query length tested with BOTH pipelines:
+#     * decode: threshold >= query_length (forces decode pipeline)
+#     * prefill: threshold < query_length (forces prefill pipeline)
+# - Compare which is faster to find crossover point
+#
--- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
@@ -0,0 +1,62 @@
+# Speculative decoding benchmark configuration
+# Tests reorder_batch_threshold optimization
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+
+batch_specs:
+  # Pure speculative decode (K-token verification)
+  - "q2s1k"      # 2-token spec, 1k KV
+  - "q4s1k"      # 4-token spec, 1k KV
+  - "q8s1k"      # 8-token spec, 1k KV
+  - "q16s1k"     # 16-token spec, 1k KV
+
+  # Speculative with different context lengths
+  - "q4s2k"      # 4-token spec, 2k KV
+  - "q4s4k"      # 4-token spec, 4k KV
+  - "q8s2k"      # 8-token spec, 2k KV
+  - "q8s4k"      # 8-token spec, 4k KV
+
+  # Mixed: speculative + regular decode
+  - "32q4s1k"                    # 32 spec requests
+  - "16q4s1k_16q1s1k"              # 16 spec + 16 regular
+  - "8q8s2k_24q1s2k"               # 8 spec (8-tok) + 24 regular
+
+  # Mixed: speculative + prefill + decode
+  - "2q1k_16q4s1k_16q1s1k"         # 2 prefill + 16 spec + 16 decode
+  - "4q2k_32q4s2k_32q1s2k"         # 4 prefill + 32 spec + 32 decode
+
+  # Large batches with speculation
+  - "64q4s1k"                    # 64 spec requests
+  - "32q8s2k"                    # 32 spec (8-token)
+  - "16q16s4k"                   # 16 spec (16-token)
+
+# Backends that support query length > 1
+backends:
+  - flashattn_mla    # reorder_batch_threshold = 512
+  - flashmla          # reorder_batch_threshold = 1 (tunable)
+
+# FlashInfer-MLA also supports uniform spec-as-decode, but with a different mechanism
+# - flashinfer_mla
+
+# Benchmark settings
+benchmark:
+  device: "cuda:0"
+  repeats: 10  # More repeats for statistical significance
+  warmup_iters: 5
+  profile_memory: false
+
+# Test these threshold values for optimization
+parameter_sweep:
+  param_name: "reorder_batch_threshold"
+  values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+  include_auto: false
+  label_format: "{backend}_threshold_{value}"
--- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml
+++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml
@@ -0,0 +1,40 @@
+# Standard attention backend benchmark configuration
+
+model:
+  num_layers: 32
+  num_q_heads: 32
+  num_kv_heads: 8  # GQA with 4:1 ratio
+  head_dim: 128
+  block_size: 16
+
+batch_specs:
+  # Pure prefill
+  - "q512"      # Small prefill (512 tokens)
+  - "q2k"       # Medium prefill (2048 tokens)
+  - "q4k"       # Large prefill (4096 tokens)
+  - "q8k"       # Very large prefill (8192 tokens)
+
+  # Pure decode
+  - "8q1s1k"      # 8 requests, 1k KV cache each
+  - "16q1s2k"     # 16 requests, 2k KV cache each
+  - "32q1s1k"     # 32 requests, 1k KV cache each
+  - "64q1s4k"     # 64 requests, 4k KV cache each
+
+  # Mixed prefill/decode
+  - "2q2k_8q1s1k"      # 2 prefill + 8 decode
+  - "4q1k_16q1s2k"     # 4 prefill + 16 decode
+  - "2q4k_32q1s1k"     # 2 large prefill + 32 decode
+
+  # Context extension
+  - "q1ks2k"          # 1k query, 2k sequence (chunked prefill)
+  - "2q1ks4k"         # 2 requests: 1k query, 4k sequence
+
+backends:
+  - flash
+  - triton
+  - flashinfer
+
+device: "cuda:0"
+repeats: 5
+warmup_iters: 3
+profile_memory: false
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -0,0 +1,836 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+MLA benchmark runner - shared utilities for MLA benchmarks.
+
+This module provides helpers for running MLA backends without
+needing full VllmConfig integration.
+"""
+
+import importlib
+
+import numpy as np
+import torch
+from batch_spec import parse_batch_spec
+from common import (
+    BenchmarkResult,
+    MockHfConfig,
+    MockKVBProj,
+    MockLayer,
+    setup_mla_dims,
+)
+
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
+
+# ============================================================================
+# VllmConfig Creation
+# ============================================================================
+
+
+def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
+    """
+    Attach stand-in per-layer query methods to a ModelConfig instance.
+
+    Four bound methods are installed directly on ``model_config``:
+
+    - ``get_num_layers()`` always returns 1
+    - ``get_sliding_window_for_layer(i)`` always returns None
+    - ``get_logits_soft_cap_for_layer(i)`` always returns None
+    - ``get_sm_scale_for_layer(i)`` returns
+      ``1.0 / model_config.get_head_size() ** 0.5``
+
+    The layer index argument is accepted and ignored by all of them.
+    """
+    from types import MethodType
+
+    def _single_layer(self):
+        return 1
+
+    def _always_none(self, _i):
+        return None
+
+    def _softmax_scale(self, _i):
+        # Head size is looked up on the captured config at call time
+        return 1.0 / model_config.get_head_size() ** 0.5
+
+    mock_methods = {
+        "get_num_layers": _single_layer,
+        "get_sliding_window_for_layer": _always_none,
+        "get_logits_soft_cap_for_layer": _always_none,
+        "get_sm_scale_for_layer": _softmax_scale,
+    }
+    for method_name, func in mock_methods.items():
+        setattr(model_config, method_name, MethodType(func, model_config))
+
+
+def create_minimal_vllm_config(
+    model_name: str = "deepseek-v3",
+    block_size: int = 128,
+    max_num_seqs: int = 256,
+    mla_dims: dict | None = None,
+) -> VllmConfig:
+    """
+    Create minimal VllmConfig for MLA benchmarks.
+
+    A throwaway directory holding a small config.json is written so that
+    ModelConfig can be constructed from a local path. The directory is removed
+    again before this function returns, whether or not construction succeeds.
+    The resulting ModelConfig then has hf_config / hf_text_config replaced by a
+    MockHfConfig built from the MLA dimensions.
+
+    Args:
+        model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
+                    provided
+        block_size: KV cache block size
+        max_num_seqs: Maximum number of sequences
+        mla_dims: Optional custom MLA dimensions dict. If not provided, uses
+                  setup_mla_dims(model_name)
+
+    Returns:
+        VllmConfig for benchmarking
+    """
+    # Get MLA dimensions - use provided or load from model name
+    if mla_dims is None:
+        mla_dims = setup_mla_dims(model_name)
+
+    # Create mock HF config first (avoids downloading from HuggingFace)
+    mock_hf_config = MockHfConfig(mla_dims)
+
+    # Create a temporary minimal config.json to avoid HF downloads
+    # This ensures consistent ModelConfig construction without network access
+    import json
+    import os
+    import shutil
+    import tempfile
+
+    # Shared by ModelConfig and SchedulerConfig so the two cannot drift apart
+    max_model_len = 32768
+
+    minimal_config = {
+        "architectures": ["DeepseekV2ForCausalLM"],
+        "model_type": "deepseek_v2",
+        "num_attention_heads": mla_dims["num_q_heads"],
+        "num_key_value_heads": mla_dims["num_kv_heads"],
+        "hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
+        "torch_dtype": "bfloat16",
+        "max_position_embeddings": 163840,  # DeepSeek V3 default
+        "rope_theta": 10000.0,
+        "vocab_size": 128256,
+    }
+
+    temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
+    try:
+        # The config.json write lives inside the try block so the directory
+        # is cleaned up even when the write itself fails.
+        config_path = os.path.join(temp_dir, "config.json")
+        with open(config_path, "w") as f:
+            json.dump(minimal_config, f)
+
+        # Create model config using local path - no HF downloads
+        model_config = ModelConfig(
+            model=temp_dir,  # Use local temp directory
+            tokenizer=None,
+            tokenizer_mode="auto",
+            trust_remote_code=True,
+            dtype="bfloat16",
+            seed=0,
+            max_model_len=max_model_len,
+            quantization=None,
+            quantization_param_path=None,
+            enforce_eager=False,
+            max_context_len_to_capture=None,
+            max_seq_len_to_capture=8192,
+            max_logprobs=20,
+            disable_sliding_window=False,
+            skip_tokenizer_init=True,
+            served_model_name=None,
+            limit_mm_per_prompt=None,
+            use_async_output_proc=True,
+            config_format="auto",
+        )
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+    # Override with our mock config
+    model_config.hf_config = mock_hf_config
+    model_config.hf_text_config = mock_hf_config
+
+    # Add mock methods for layer-specific queries
+    _add_mock_methods_to_model_config(model_config)
+
+    # Create sub-configs
+    cache_config = CacheConfig(
+        block_size=block_size,
+        gpu_memory_utilization=0.9,
+        swap_space=0,
+        cache_dtype="auto",
+        enable_prefix_caching=False,
+    )
+
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=8192,
+        max_model_len=max_model_len,
+        is_encoder_decoder=False,
+        enable_chunked_prefill=True,
+    )
+
+    parallel_config = ParallelConfig(
+        tensor_parallel_size=1,
+    )
+
+    compilation_config = CompilationConfig()
+
+    return VllmConfig(
+        model_config=model_config,
+        cache_config=cache_config,
+        parallel_config=parallel_config,
+        scheduler_config=scheduler_config,
+        compilation_config=compilation_config,
+    )
+
+
+# ============================================================================
+# Backend Configuration
+# ============================================================================
+
+
+# Short backend name -> prefix shared by that backend's class names
+_BACKEND_NAME_MAP = {
+    "flashattn_mla": "FlashAttnMLA",
+    "flashmla": "FlashMLA",
+    "flashinfer_mla": "FlashInferMLA",
+    "cutlass_mla": "CutlassMLA",
+}
+
+# Per-backend overrides; anything missing here falls back to the defaults
+# applied in _get_backend_config()
+_BACKEND_PROPERTIES = {
+    "flashmla": {
+        "query_format": "concat",  # Single concatenated tensor (vs tuple)
+        "block_size": 64,  # FlashMLA uses fixed block size
+    },
+    "flashinfer_mla": {
+        "block_size": 64,  # FlashInfer MLA only supports 32 or 64
+    },
+}
+
+
+def _get_backend_config(backend: str) -> dict:
+    """
+    Derive a backend's module path and class names from its short name.
+
+    The returned dict has these keys:
+    - module: vllm.v1.attention.backends.mla.{backend}
+    - impl_class: {Name}Impl
+    - metadata_class: {Name}Metadata, or MLACommonMetadata for
+      flashinfer_mla / cutlass_mla
+    - decode_metadata_class: {Name}DecodeMetadata, or MLACommonDecodeMetadata
+      for flashinfer_mla / cutlass_mla
+    - builder_class: {Name}MetadataBuilder
+    - query_format: override from _BACKEND_PROPERTIES, default "tuple"
+    - block_size: override from _BACKEND_PROPERTIES, default None
+
+    Raises:
+        ValueError: If the backend name is not in _BACKEND_NAME_MAP
+    """
+    prefix = _BACKEND_NAME_MAP.get(backend)
+    if prefix is None:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    overrides = _BACKEND_PROPERTIES.get(backend, {})
+
+    # FlashInfer and CUTLASS reuse the common metadata classes
+    if backend in ("flashinfer_mla", "cutlass_mla"):
+        metadata_class = "MLACommonMetadata"
+        decode_metadata_class = "MLACommonDecodeMetadata"
+    else:
+        metadata_class = f"{prefix}Metadata"
+        decode_metadata_class = f"{prefix}DecodeMetadata"
+
+    return {
+        "module": f"vllm.v1.attention.backends.mla.{backend}",
+        "impl_class": f"{prefix}Impl",
+        "metadata_class": metadata_class,
+        "decode_metadata_class": decode_metadata_class,
+        "builder_class": f"{prefix}MetadataBuilder",
+        "query_format": overrides.get("query_format", "tuple"),
+        "block_size": overrides.get("block_size"),
+    }
+
+
+# ============================================================================
+# Metadata Building Helpers
+# ============================================================================
+
+
+def _build_attention_metadata(
+    requests: list,
+    block_size: int,
+    device: torch.device,
+    builder_instance,
+) -> tuple:
+    """
+    Build attention metadata from batch requests.
+
+    Every request is given its own consecutive run of KV-cache block ids
+    (the first request starts at block 0), and query token ``j`` of a request
+    is mapped to the slot at sequence position ``kv_len - q_len + j``.
+
+    Args:
+        requests: List of BatchRequest objects (only ``q_len`` and ``kv_len``
+                  are read)
+        block_size: KV cache block size
+        device: Target device
+        builder_instance: Metadata builder instance
+
+    Returns:
+        Tuple of (metadata, kv_cache_num_blocks)
+
+    Raises:
+        ValueError: If ``requests`` is empty, or a request has kv_len < q_len
+    """
+    if not requests:
+        raise ValueError("Cannot build attention metadata for an empty batch")
+
+    q_lens = [r.q_len for r in requests]
+    kv_lens = [r.kv_len for r in requests]
+    for idx, (q_len, kv_len) in enumerate(zip(q_lens, kv_lens)):
+        # A query longer than its sequence would yield negative KV positions,
+        # which silently wrap around when indexing the block table.
+        if kv_len < q_len:
+            raise ValueError(
+                f"Request {idx} has kv_len ({kv_len}) < q_len ({q_len})"
+            )
+    total_q = sum(q_lens)
+    max_kv = max(kv_lens)
+
+    # Build query start locations (running sum, one pass)
+    q_starts = [0]
+    for q_len in q_lens:
+        q_starts.append(q_starts[-1] + q_len)
+    q_start_cpu = torch.tensor(q_starts, dtype=torch.int32)
+    q_start_gpu = q_start_cpu.to(device)
+
+    # Build sequence lengths
+    seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
+    seq_lens_gpu = seq_lens_cpu.to(device)
+
+    # Build num_computed_tokens (context length for each request)
+    context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
+    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
+
+    # Build block table: ceil(kv_len / block_size) blocks per request
+    num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
+    max_num_blocks = max(num_blocks_per_req)
+
+    block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
+    current_block = 0
+    for i, num_blocks in enumerate(num_blocks_per_req):
+        block_table_cpu[i, :num_blocks] = np.arange(
+            current_block, current_block + num_blocks, dtype=np.int32
+        )
+        current_block += num_blocks
+
+    block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
+
+    # Build slot mapping, one vectorized chunk per request. The arithmetic is
+    # done in int64 so block_id * block_size cannot overflow int32.
+    slot_chunks = []
+    for i, (context_len, kv_len) in enumerate(zip(context_lens, kv_lens)):
+        token_kv_idx = np.arange(context_len, kv_len, dtype=np.int64)
+        block_ids = block_table_cpu[i, token_kv_idx // block_size].astype(np.int64)
+        slot_chunks.append(block_ids * block_size + token_kv_idx % block_size)
+
+    slot_mapping = torch.from_numpy(np.concatenate(slot_chunks)).to(device)
+
+    # Create CommonAttentionMetadata
+    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+
+    common_attn_metadata = CommonAttentionMetadata(
+        num_reqs=len(requests),
+        max_query_len=max(q_lens),
+        max_seq_len=max_kv,
+        num_actual_tokens=total_q,
+        query_start_loc=q_start_gpu,
+        query_start_loc_cpu=q_start_cpu,
+        seq_lens=seq_lens_gpu,
+        _seq_lens_cpu=seq_lens_cpu,
+        _num_computed_tokens_cpu=num_computed_tokens_cpu,
+        slot_mapping=slot_mapping,
+        block_table_tensor=block_table_gpu,
+        dcp_local_seq_lens=None,
+    )
+
+    # Use the production build() method
+    metadata = builder_instance.build(
+        common_prefix_len=0,
+        common_attn_metadata=common_attn_metadata,
+        fast_build=False,
+    )
+
+    return metadata, current_block
+
+
+def _create_input_tensors(
+    total_q: int,
+    mla_dims: dict,
+    query_format: str,
+    device: torch.device,
+    dtype: torch.dtype,
+):
+    """
+    Allocate random query/key inputs for the decode and prefill code paths.
+
+    The two paths use different query widths:
+    - Decode: kv_lora_rank (+ qk_rope_head_dim) per head
+    - Prefill: qk_nope_head_dim + qk_rope_head_dim per head
+
+    Args:
+        total_q: Total number of query tokens
+        mla_dims: MLA dimension configuration
+        query_format: "tuple" for a (q_nope, q_pe) pair; any other value is
+                      treated as "concat" (single concatenated tensor)
+        device: Target device
+        dtype: Tensor dtype
+
+    Returns:
+        Tuple of (decode_inputs, prefill_inputs)
+        - decode_inputs: Query tensor(s) for decode mode
+        - prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' and
+          'output' for prefill
+    """
+    num_heads = mla_dims["num_q_heads"]
+    lora_rank = mla_dims["kv_lora_rank"]
+    rope_dim = mla_dims["qk_rope_head_dim"]
+    nope_dim = mla_dims["qk_nope_head_dim"]
+
+    def _rand(*shape):
+        return torch.randn(*shape, device=device, dtype=dtype)
+
+    # NOTE: the order of the random draws below is kept fixed on purpose so
+    # that a seeded run yields the same tensors.
+    if query_format == "tuple":
+        # Decode takes (q_nope, q_pe) with q_nope at kv_lora_rank width
+        q_nope_decode = _rand(total_q, num_heads, lora_rank)
+        q_pe = _rand(total_q, num_heads, rope_dim)
+        decode_inputs = (q_nope_decode, q_pe)
+
+        # Prefill swaps the nope part for one at qk_nope_head_dim width and
+        # reuses the same rope part
+        q_nope_prefill = _rand(total_q, num_heads, nope_dim)
+        prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
+    else:  # concat
+        decode_inputs = _rand(total_q, num_heads, lora_rank + rope_dim)
+        prefill_q = _rand(total_q, num_heads, nope_dim + rope_dim)
+
+    # Extra inputs consumed only by the prefill forward
+    k_c_normed = _rand(total_q, lora_rank)
+    k_pe = _rand(total_q, 1, rope_dim)  # Single head for MLA
+    k_scale = torch.ones(1, device=device, dtype=torch.float32)
+    output = torch.zeros(
+        total_q,
+        num_heads * mla_dims["v_head_dim"],
+        device=device,
+        dtype=dtype,
+    )
+
+    prefill_inputs = {
+        "q": prefill_q,
+        "k_c_normed": k_c_normed,
+        "k_pe": k_pe,
+        "k_scale": k_scale,
+        "output": output,
+    }
+    return decode_inputs, prefill_inputs
+
+
+# ============================================================================
+# Backend Initialization
+# ============================================================================
+
+
+def _create_backend_impl(
+    backend_cfg: dict,
+    mla_dims: dict,
+    vllm_config: VllmConfig,
+    device: torch.device,
+):
+    """
+    Create backend implementation instance.
+
+    The backend module named in ``backend_cfg`` is imported dynamically, its
+    impl class is instantiated with the MLA dimensions, and a MockLayer plus a
+    metadata builder are created around it.
+
+    Args:
+        backend_cfg: Backend configuration dict
+        mla_dims: MLA dimension configuration
+        vllm_config: VllmConfig instance; its
+                     compilation_config.static_forward_context is overwritten
+                     when a builder is created
+        device: Target device
+
+    Returns:
+        Tuple of (impl, layer, builder_instance)
+    """
+    # Import backend classes
+    backend_module = importlib.import_module(backend_cfg["module"])
+    impl_class = getattr(backend_module, backend_cfg["impl_class"])
+
+    # Calculate scale
+    qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
+    scale = 1.0 / np.sqrt(qk_head_dim)
+
+    # Create mock kv_b_proj layer for prefill mode
+    mock_kv_b_proj = MockKVBProj(
+        num_heads=mla_dims["num_q_heads"],
+        qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
+        v_head_dim=mla_dims["v_head_dim"],
+    )
+
+    # Create impl
+    impl = impl_class(
+        num_heads=mla_dims["num_q_heads"],
+        head_size=mla_dims["head_dim"],
+        scale=scale,
+        num_kv_heads=mla_dims["num_kv_heads"],
+        alibi_slopes=None,
+        sliding_window=None,
+        kv_cache_dtype="auto",
+        logits_soft_cap=None,
+        attn_type="decoder",
+        kv_sharing_target_layer_name=None,
+        q_lora_rank=None,
+        kv_lora_rank=mla_dims["kv_lora_rank"],
+        qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
+        qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
+        qk_head_dim=qk_head_dim,
+        v_head_dim=mla_dims["v_head_dim"],
+        kv_b_proj=mock_kv_b_proj,
+    )
+
+    # Initialize DCP attributes
+    if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
+        impl.dcp_world_size = 1
+        impl.dcp_rank = 0
+
+    # Create KV cache spec for MockLayer
+    from vllm.v1.kv_cache_interface import FullAttentionSpec
+
+    # Cache entry width is derived from the MLA dims (512 + 64 = 576 for the
+    # DeepSeek defaults) so that it always matches the KV cache tensor that
+    # the benchmark allocates, including when custom dims are supplied.
+    kv_cache_head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
+
+    kv_cache_spec = FullAttentionSpec(
+        block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
+        num_kv_heads=1,  # MLA uses 1 KV head
+        head_size=kv_cache_head_size,
+        dtype=torch.bfloat16,
+    )
+
+    # Create mock layer
+    layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
+
+    # Create builder instance if needed
+    builder_instance = None
+    if backend_cfg["builder_class"]:
+        builder_class = getattr(backend_module, backend_cfg["builder_class"])
+
+        # Populate static_forward_context so builder can find the layer
+        # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
+        vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
+
+        builder_instance = builder_class(
+            kv_cache_spec=kv_cache_spec,
+            layer_names=["placeholder"],
+            vllm_config=vllm_config,
+            device=device,
+        )
+
+    return impl, layer, builder_instance
+
+
+# ============================================================================
+# Config Helpers
+# ============================================================================
+
+
+def _extract_mla_dims_from_config(config) -> dict | None:
+    """
+    Derive the MLA dimension dict from a BenchmarkConfig, if possible.
+
+    Resolution order:
+    1. If kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim and v_head_dim are
+       all set (not None), they are used as-is.
+    2. Otherwise, if head_dim is 576, the fixed values 512 / 128 / 64 / 128
+       are used for those four fields.
+    3. Otherwise None is returned.
+
+    In cases 1 and 2 the result also carries num_q_heads, num_kv_heads and
+    head_dim copied from the config.
+
+    Args:
+        config: BenchmarkConfig instance
+
+    Returns:
+        Dict with MLA dimensions, or None when they cannot be determined
+    """
+    mla_field_names = (
+        "kv_lora_rank",
+        "qk_nope_head_dim",
+        "qk_rope_head_dim",
+        "v_head_dim",
+    )
+    explicit = {name: getattr(config, name) for name in mla_field_names}
+
+    if all(value is not None for value in explicit.values()):
+        dims = explicit
+    elif config.head_dim == 576:
+        # Looks like a DeepSeek MLA config: standard dimensions, custom head
+        # count
+        dims = {
+            "kv_lora_rank": 512,
+            "qk_nope_head_dim": 128,
+            "qk_rope_head_dim": 64,
+            "v_head_dim": 128,
+        }
+    else:
+        return None
+
+    dims["num_q_heads"] = config.num_q_heads
+    dims["num_kv_heads"] = config.num_kv_heads
+    dims["head_dim"] = config.head_dim
+    return dims
+
+
+# ============================================================================
+# Benchmark Execution
+# ============================================================================
+
+
+def _run_single_benchmark(
+    config,
+    impl,
+    layer,
+    builder_instance,
+    backend_cfg: dict,
+    mla_dims: dict,
+    device: torch.device,
+) -> BenchmarkResult:
+    """
+    Time one benchmark configuration on an already-initialized backend.
+
+    The decode path is timed whenever the built metadata has a decode part;
+    the prefill path is timed only when there is no decode part. Each repeat
+    calls the forward function ``config.num_layers`` times between two CUDA
+    events and records the per-call time in seconds.
+
+    Args:
+        config: BenchmarkConfig instance
+        impl: Backend implementation instance
+        layer: MockLayer instance
+        builder_instance: Metadata builder instance
+        backend_cfg: Backend configuration dict
+        mla_dims: MLA dimension configuration
+        device: Target device
+
+    Returns:
+        BenchmarkResult with timing statistics
+
+    Raises:
+        RuntimeError: If the metadata has neither decode nor prefill parts
+    """
+    requests = parse_batch_spec(config.batch_spec)
+    total_q = sum(r.q_len for r in requests)
+
+    # A backend-imposed block size wins over the configured one
+    block_size = backend_cfg["block_size"] or config.block_size
+
+    metadata, num_blocks = _build_attention_metadata(
+        requests, block_size, device, builder_instance
+    )
+
+    kv_cache_width = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
+    kv_cache = torch.zeros(
+        num_blocks,
+        block_size,
+        kv_cache_width,
+        device=device,
+        dtype=torch.bfloat16,
+    )
+
+    decode_inputs, prefill_inputs = _create_input_tensors(
+        total_q,
+        mla_dims,
+        backend_cfg["query_format"],
+        device,
+        torch.bfloat16,
+    )
+
+    # Pick the forward path from what the metadata contains
+    if metadata.decode is not None:
+
+        def forward_fn():
+            return impl._forward_decode(decode_inputs, kv_cache, metadata, layer)
+
+    elif metadata.prefill is not None:
+
+        def forward_fn():
+            return impl._forward_prefill(
+                prefill_inputs["q"],
+                prefill_inputs["k_c_normed"],
+                prefill_inputs["k_pe"],
+                kv_cache,
+                metadata,
+                prefill_inputs["k_scale"],
+                prefill_inputs["output"],
+            )
+
+    else:
+        raise RuntimeError("Metadata has neither decode nor prefill metadata")
+
+    # Warmup
+    for _ in range(config.warmup_iters):
+        forward_fn()
+    torch.cuda.synchronize()
+
+    def _time_one_repeat() -> float:
+        """Return seconds per forward call for one timed repeat."""
+        start_evt = torch.cuda.Event(enable_timing=True)
+        end_evt = torch.cuda.Event(enable_timing=True)
+
+        start_evt.record()
+        for _ in range(config.num_layers):
+            forward_fn()
+        end_evt.record()
+
+        torch.cuda.synchronize()
+        # elapsed_time() is in milliseconds
+        return start_evt.elapsed_time(end_evt) / 1000.0 / config.num_layers
+
+    times = [_time_one_repeat() for _ in range(config.repeats)]
+
+    mean_time = float(np.mean(times))
+    throughput = total_q / mean_time if mean_time > 0 else 0
+    return BenchmarkResult(
+        config=config,
+        mean_time=mean_time,
+        std_time=float(np.std(times)),
+        min_time=float(np.min(times)),
+        max_time=float(np.max(times)),
+        throughput_tokens_per_sec=throughput,
+    )
+
+
+def _run_mla_benchmark_batched(
+    backend: str,
+    configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
+) -> list[BenchmarkResult]:
+    """
+    Unified batched MLA benchmark runner for all backends.
+
+    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
+
+    This function reuses backend initialization across multiple benchmarks
+    to avoid setup/teardown overhead. Device, block size and MLA dimensions
+    are all taken from the FIRST config in the list.
+
+    Args:
+        backend: Backend name
+        configs_with_params: List of (config, threshold, num_splits) tuples
+            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
+            - num_splits: num_kv_splits (CUTLASS only); applied only when the
+              impl has a ``_num_kv_splits`` attribute
+
+    Returns:
+        List of BenchmarkResult objects
+    """
+    if not configs_with_params:
+        return []
+
+    backend_cfg = _get_backend_config(backend)
+    first_config = configs_with_params[0][0]
+    device = torch.device(first_config.device)
+    torch.cuda.set_device(device)
+
+    # Determine block size (a backend-imposed size wins)
+    # NOTE(review): _run_single_benchmark re-derives block size from each
+    # config; configs with differing block_size would disagree with the
+    # shared builder created here - confirm callers keep it uniform.
+    block_size = backend_cfg["block_size"] or first_config.block_size
+
+    # Extract MLA dimensions from the first config
+    mla_dims = _extract_mla_dims_from_config(first_config)
+
+    # If config didn't provide MLA dims, fall back to default model
+    if mla_dims is None:
+        mla_dims = setup_mla_dims("deepseek-v3")
+
+    # Create and set vLLM config for MLA (reused across all benchmarks)
+    vllm_config = create_minimal_vllm_config(
+        model_name="deepseek-v3",  # Used only for model path
+        block_size=block_size,
+        mla_dims=mla_dims,  # Use custom dims from config or default
+    )
+
+    results = []
+
+    with set_current_vllm_config(vllm_config):
+        # Create backend impl, layer, and builder (reused across benchmarks)
+        impl, layer, builder_instance = _create_backend_impl(
+            backend_cfg, mla_dims, vllm_config, device
+        )
+
+        # Run each benchmark with the shared impl
+        for config, threshold, num_splits in configs_with_params:
+            # Overrides are tracked with explicit flags rather than by testing
+            # the saved value against None: an original value that is itself
+            # None would otherwise never be restored, and the override would
+            # leak into the following benchmarks.
+
+            # Set threshold for this benchmark (FlashAttn/FlashMLA only)
+            threshold_overridden = False
+            original_threshold = None
+            if threshold is not None and builder_instance:
+                original_threshold = builder_instance.reorder_batch_threshold
+                builder_instance.reorder_batch_threshold = threshold
+                threshold_overridden = True
+
+            # Set num_splits for CUTLASS
+            splits_overridden = False
+            original_num_splits = None
+            if num_splits is not None and hasattr(impl, "_num_kv_splits"):
+                original_num_splits = impl._num_kv_splits
+                impl._num_kv_splits = num_splits
+                splits_overridden = True
+
+            try:
+                result = _run_single_benchmark(
+                    config,
+                    impl,
+                    layer,
+                    builder_instance,
+                    backend_cfg,
+                    mla_dims,
+                    device,
+                )
+                results.append(result)
+
+            finally:
+                # Restore original threshold
+                if threshold_overridden:
+                    builder_instance.reorder_batch_threshold = original_threshold
+
+                # Restore original num_splits
+                if splits_overridden:
+                    impl._num_kv_splits = original_num_splits
+
+    return results
+
+
+# ============================================================================
+# Public API
+# ============================================================================
+
+
+def run_mla_benchmark(
+    backend: str,
+    config,
+    reorder_batch_threshold: int | None = None,
+    num_kv_splits: int | None = None,
+) -> BenchmarkResult | list[BenchmarkResult]:
+    """
+    Public entry point for MLA benchmarks on any supported backend.
+
+    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
+
+    Every call is normalized to a list of (config, threshold, num_splits)
+    triples and handed to the batched runner.
+
+    Args:
+        backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
+        config: One of
+            - a single BenchmarkConfig
+            - a list of BenchmarkConfig
+            - a list of (BenchmarkConfig, param) tuples; ``param`` is used as
+              the threshold for flashattn_mla / flashmla and as num_splits for
+              every other backend (the format is detected from the first
+              element only)
+        reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
+                                 (single config mode only)
+        num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
+
+    Returns:
+        BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
+    """
+    if not isinstance(config, list):
+        # Single config: wrap it, run it, unwrap the only result
+        single = [(config, reorder_batch_threshold, num_kv_splits)]
+        return _run_mla_benchmark_batched(backend, single)[0]
+
+    if config and isinstance(config[0], tuple):
+        # [(cfg, param), ...] - which slot param fills depends on the backend
+        if backend in ("flashattn_mla", "flashmla"):
+            batched = [(cfg, param, None) for cfg, param in config]
+        else:  # cutlass_mla or flashinfer_mla
+            batched = [(cfg, None, param) for cfg, param in config]
+    else:
+        # [cfg, ...] - plain configs without per-run parameters
+        batched = [(cfg, None, None) for cfg in config]
+
+    return _run_mla_benchmark_batched(backend, batched)
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -0,0 +1,481 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
+
+This module provides helpers for running standard attention backends
+(FlashAttention, Triton, FlashInfer) with real vLLM integration.
+"""
+
+import types
+
+import numpy as np
+import torch
+from batch_spec import parse_batch_spec, reorder_for_flashinfer
+from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
+
+from vllm.config import (
+    CacheConfig,
+    CompilationConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.kv_cache_interface import FullAttentionSpec
+
+# ============================================================================
+# Backend Configuration
+# ============================================================================
+
+
+# Registry of the standard (non-MLA) attention backends this runner can drive,
+# keyed by the backend name given in the benchmark config. Each entry provides:
+#   module        - dotted module path imported by _create_backend_impl
+#   backend_class - name of the backend class looked up on that module
+#   dtype         - torch dtype used for the Q/K/V inputs, output and KV cache
+#   cache_layout  - "flashinfer" selects the [num_blocks, 2, ...] cache shape in
+#                   _create_kv_cache; any other value selects [2, num_blocks, ...]
+_BACKEND_CONFIG = {
+    "flash": {
+        "module": "vllm.v1.attention.backends.flash_attn",
+        "backend_class": "FlashAttentionBackend",
+        "dtype": torch.float16,
+        "cache_layout": "standard",
+        # ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
+    },
+    "triton": {
+        "module": "vllm.v1.attention.backends.triton_attn",
+        "backend_class": "TritonAttentionBackend",
+        "dtype": torch.float32,
+        "cache_layout": "standard",
+    },
+    "flashinfer": {
+        "module": "vllm.v1.attention.backends.flashinfer",
+        "backend_class": "FlashInferBackend",
+        "dtype": torch.float16,
+        "cache_layout": "flashinfer",
+        # ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
+    },
+}
+
+
+def _get_backend_config(backend: str) -> dict:
+    """Look up the registry entry for ``backend``.
+
+    Args:
+        backend: Key into ``_BACKEND_CONFIG``.
+
+    Returns:
+        The configuration dict registered under ``backend``.
+
+    Raises:
+        ValueError: If ``backend`` is not a registered key; the message lists
+            the available keys.
+    """
+    if backend in _BACKEND_CONFIG:
+        return _BACKEND_CONFIG[backend]
+
+    raise ValueError(
+        f"Unknown backend: {backend}. "
+        f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
+    )
+
+
+# ============================================================================
+# Metadata Building Helpers
+# ============================================================================
+
+
+def _build_common_attn_metadata(
+    q_lens: list[int],
+    kv_lens: list[int],
+    block_size: int,
+    device: torch.device,
+) -> CommonAttentionMetadata:
+    """Assemble a CommonAttentionMetadata for a batch described by its lengths.
+
+    Args:
+        q_lens: Query length of each request.
+        kv_lens: KV (sequence) length of each request.
+        block_size: Number of tokens per KV cache block.
+        device: Device on which the device-side tensors are created.
+
+    Returns:
+        Metadata in which every request owns a contiguous run of
+        ``ceil(max(kv_lens) / block_size)`` block ids, so the block table
+        references ids in ``[0, len(q_lens) * blocks_per_request)``, and the
+        slot mapping is simply ``0 .. sum(q_lens) - 1``.
+    """
+    num_reqs = len(q_lens)
+    num_tokens = sum(q_lens)
+
+    # Cumulative query offsets with a leading zero.
+    q_lens_t = torch.tensor(q_lens, dtype=torch.int32, device=device)
+    query_start_loc = torch.zeros(num_reqs + 1, dtype=torch.int32, device=device)
+    query_start_loc[1:] = q_lens_t.cumsum(0)
+    query_start_loc_cpu = query_start_loc.cpu()
+
+    seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
+    seq_lens_cpu = seq_lens.cpu()
+    longest_seq = int(seq_lens_cpu.max())
+
+    # Tokens already computed = KV length minus the new query tokens.
+    num_computed_tokens_cpu = torch.tensor(
+        [kv_len - q_len for kv_len, q_len in zip(kv_lens, q_lens)],
+        dtype=torch.int32,
+    )
+
+    # Row i of the block table holds ids [i * n, (i + 1) * n).
+    blocks_per_req = (max(kv_lens) + block_size - 1) // block_size
+    block_ids = torch.arange(
+        num_reqs * blocks_per_req, dtype=torch.int32, device=device
+    )
+    block_table_tensor = block_ids.view(num_reqs, blocks_per_req)
+
+    slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device=device)
+    longest_query = max(q_lens)
+
+    return CommonAttentionMetadata(
+        query_start_loc=query_start_loc,
+        query_start_loc_cpu=query_start_loc_cpu,
+        seq_lens=seq_lens,
+        seq_lens_cpu=seq_lens_cpu,
+        num_computed_tokens_cpu=num_computed_tokens_cpu,
+        num_reqs=num_reqs,
+        num_actual_tokens=num_tokens,
+        max_query_len=longest_query,
+        max_seq_len=longest_seq,
+        block_table_tensor=block_table_tensor,
+        slot_mapping=slot_mapping,
+        causal=True,
+    )
+
+
+def _create_vllm_config(
+    config: BenchmarkConfig,
+    dtype: torch.dtype,
+    max_num_blocks: int,
+) -> VllmConfig:
+    """Build a VllmConfig whose model accessors report the benchmark's shapes.
+
+    Args:
+        config: Benchmark configuration supplying block size, layer count,
+            head counts and head dimension.
+        dtype: Model dtype passed to ModelConfig.
+        max_num_blocks: Value stored as the cache config's GPU block count.
+
+    Returns:
+        A VllmConfig assembled from freshly constructed sub-configs. The
+        ModelConfig instance has several getter methods replaced so that they
+        return values taken from ``config`` instead of the named model.
+    """
+    model_config = ModelConfig(
+        model="meta-llama/Meta-Llama-3-8B",
+        tokenizer="meta-llama/Meta-Llama-3-8B",
+        trust_remote_code=False,
+        dtype=dtype,
+        seed=0,
+        max_model_len=1024,
+    )
+
+    cache_config = CacheConfig(
+        block_size=config.block_size,
+        cache_dtype="auto",
+        swap_space=0,
+    )
+    cache_config.num_gpu_blocks = max_num_blocks
+    cache_config.num_cpu_blocks = 0
+
+    parallel_config = ParallelConfig(tensor_parallel_size=1)
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=256,
+        max_num_batched_tokens=8192,
+        max_model_len=8192,
+        is_encoder_decoder=False,
+        enable_chunked_prefill=True,
+    )
+    device_config = DeviceConfig()
+    load_config = LoadConfig()
+    compilation_config = CompilationConfig()
+
+    # Getter name -> replacement; each is bound to this model_config instance
+    # so the benchmark's shapes override whatever the named model would report.
+    mock_getters = {
+        "get_num_layers": lambda self: config.num_layers,
+        "get_sliding_window_for_layer": lambda self, i: None,
+        "get_logits_soft_cap_for_layer": lambda self, i: 0.0,
+        "get_sm_scale_for_layer": lambda self, i: 1.0 / config.head_dim**0.5,
+        "get_num_attention_heads": (
+            lambda self, parallel_config=None: config.num_q_heads
+        ),
+        "get_num_kv_heads": (
+            lambda self, parallel_config=None: config.num_kv_heads
+        ),
+        "get_head_size": lambda self: config.head_dim,
+        "get_sliding_window": lambda self: None,
+    }
+    for getter_name, getter in mock_getters.items():
+        setattr(model_config, getter_name, types.MethodType(getter, model_config))
+
+    return VllmConfig(
+        model_config=model_config,
+        cache_config=cache_config,
+        parallel_config=parallel_config,
+        scheduler_config=scheduler_config,
+        device_config=device_config,
+        load_config=load_config,
+        compilation_config=compilation_config,
+    )
+
+
+# ============================================================================
+# Backend Initialization
+# ============================================================================
+
+
+def _create_backend_impl(
+    backend_cfg: dict,
+    config: BenchmarkConfig,
+    device: torch.device,
+):
+    """Instantiate the attention implementation and a mock layer for a backend.
+
+    Args:
+        backend_cfg: Registry entry with "module", "backend_class" and "dtype".
+        config: Benchmark configuration supplying head counts, head dimension
+            and block size.
+        device: Device handed to the mock layer.
+
+    Returns:
+        Tuple ``(backend_class, impl, layer, dtype)``.
+    """
+    import importlib
+
+    module = importlib.import_module(backend_cfg["module"])
+    backend_class = getattr(module, backend_cfg["backend_class"])
+
+    softmax_scale = get_attention_scale(config.head_dim)
+    dtype = backend_cfg["dtype"]
+
+    impl_cls = backend_class.get_impl_cls()
+    impl = impl_cls(
+        num_heads=config.num_q_heads,
+        head_size=config.head_dim,
+        scale=softmax_scale,
+        num_kv_heads=config.num_kv_heads,
+        alibi_slopes=None,
+        sliding_window=None,
+        kv_cache_dtype="auto",
+    )
+
+    # The mock layer carries the KV cache spec matching this backend's dtype.
+    spec = FullAttentionSpec(
+        block_size=config.block_size,
+        num_kv_heads=config.num_kv_heads,
+        head_size=config.head_dim,
+        dtype=dtype,
+    )
+    mock_layer = MockLayer(device, kv_cache_spec=spec)
+
+    return backend_class, impl, mock_layer, dtype
+
+
+def _create_metadata_builder(
+    backend_class,
+    kv_cache_spec: FullAttentionSpec,
+    vllm_config: VllmConfig,
+    device: torch.device,
+):
+    """Instantiate the backend's metadata builder for a single layer.
+
+    Args:
+        backend_class: Backend class exposing ``get_builder_cls()``.
+        kv_cache_spec: KV cache spec forwarded to the builder.
+        vllm_config: vLLM configuration forwarded to the builder.
+        device: Device forwarded to the builder.
+
+    Returns:
+        The builder instance, registered for the single layer name "layer_0".
+    """
+    builder_cls = backend_class.get_builder_cls()
+    builder = builder_cls(
+        kv_cache_spec=kv_cache_spec,
+        layer_names=["layer_0"],
+        vllm_config=vllm_config,
+        device=device,
+    )
+    return builder
+
+
+# ============================================================================
+# Tensor Creation Helpers
+# ============================================================================
+
+
+def _create_input_tensors(
+    config: BenchmarkConfig,
+    total_q: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> tuple:
+    """Generate random per-layer query, key and value tensors.
+
+    Args:
+        config: Benchmark configuration supplying layer count, head counts and
+            head dimension.
+        total_q: Number of token rows in every tensor.
+        device: Device on which the tensors are created.
+        dtype: Element type of the tensors.
+
+    Returns:
+        ``(q_list, k_list, v_list)``, each a list with one tensor per layer.
+        Queries have ``num_q_heads`` heads; keys and values have
+        ``num_kv_heads`` heads.
+    """
+
+    def _random_per_layer(num_heads: int) -> list:
+        # One (total_q, num_heads, head_dim) tensor for every layer.
+        return [
+            torch.randn(total_q, num_heads, config.head_dim, device=device, dtype=dtype)
+            for _ in range(config.num_layers)
+        ]
+
+    # Draw order (all Q, then all K, then all V) matters for RNG reproducibility.
+    queries = _random_per_layer(config.num_q_heads)
+    keys = _random_per_layer(config.num_kv_heads)
+    values = _random_per_layer(config.num_kv_heads)
+    return queries, keys, values
+
+
+def _create_kv_cache(
+    config: BenchmarkConfig,
+    max_num_blocks: int,
+    cache_layout: str,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> list:
+    """Allocate one zero-filled KV cache tensor per layer.
+
+    Args:
+        config: Benchmark configuration supplying block size, KV head count,
+            head dimension and layer count.
+        max_num_blocks: Number of cache blocks in each tensor.
+        cache_layout: "flashinfer" puts the block axis first; any other value
+            puts the K/V axis first.
+        device: Device on which the tensors are created.
+        dtype: Element type of the tensors.
+
+    Returns:
+        A list of ``config.num_layers`` tensors.
+    """
+    block_dims = (config.block_size, config.num_kv_heads, config.head_dim)
+
+    if cache_layout == "flashinfer":
+        # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
+        cache_shape = (max_num_blocks, 2, *block_dims)
+    else:
+        # Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
+        cache_shape = (2, max_num_blocks, *block_dims)
+
+    return [
+        torch.zeros(cache_shape, device=device, dtype=dtype)
+        for _ in range(config.num_layers)
+    ]
+
+
+# ============================================================================
+# Benchmark Execution
+# ============================================================================
+
+
+def _run_single_benchmark(
+    config: BenchmarkConfig,
+    impl,
+    layer,
+    q_list: list,
+    k_list: list,
+    v_list: list,
+    cache_list: list,
+    attn_metadata,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> tuple:
+    """Warm up, then time repeated forward passes over every layer.
+
+    Args:
+        config: Benchmark configuration supplying warmup/repeat counts, layer
+            count, head shape and the memory-profiling flag.
+        impl: Attention implementation whose ``forward`` is timed.
+        layer: Layer object passed through to ``impl.forward``.
+        q_list: Per-layer query tensors.
+        k_list: Per-layer key tensors.
+        v_list: Per-layer value tensors.
+        cache_list: Per-layer KV cache tensors.
+        attn_metadata: Backend metadata passed through to ``impl.forward``.
+        device: Device used for the output buffer and memory statistics.
+        dtype: Element type of the output buffer.
+
+    Returns:
+        ``(times, mem_stats)`` where ``times`` holds one seconds-per-layer
+        figure for each repeat and ``mem_stats`` is empty unless
+        ``config.profile_memory`` is set.
+    """
+    num_tokens = q_list[0].shape[0]
+    # A single output buffer is reused by every layer and every iteration.
+    out = torch.empty(
+        num_tokens, config.num_q_heads, config.head_dim, device=device, dtype=dtype
+    )
+
+    def _forward_all_layers() -> None:
+        # One forward call per layer, all writing into the shared buffer.
+        for layer_idx in range(config.num_layers):
+            impl.forward(
+                layer,
+                q_list[layer_idx],
+                k_list[layer_idx],
+                v_list[layer_idx],
+                cache_list[layer_idx],
+                attn_metadata,
+                output=out,
+            )
+
+    # Warmup
+    for _ in range(config.warmup_iters):
+        _forward_all_layers()
+    torch.cuda.synchronize()
+
+    # Benchmark
+    times = []
+    for _ in range(config.repeats):
+        start_evt = torch.cuda.Event(enable_timing=True)
+        stop_evt = torch.cuda.Event(enable_timing=True)
+
+        start_evt.record()
+        _forward_all_layers()
+        stop_evt.record()
+
+        # Wait for both events to complete before reading the elapsed time.
+        torch.cuda.synchronize()
+        total_ms = start_evt.elapsed_time(stop_evt)
+        times.append(total_ms / 1000.0 / config.num_layers)  # seconds per layer
+
+    if config.profile_memory:
+        mem_stats = {
+            "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
+            "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+        }
+    else:
+        mem_stats = {}
+
+    return times, mem_stats
+
+
+# ============================================================================
+# Public API
+# ============================================================================
+
+
+def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
+    """
+    Run standard attention benchmark with real kernels.
+
+    Supports: flash, triton, flashinfer
+
+    The batch described by ``config.batch_spec`` is parsed into requests, the
+    backend implementation and its metadata builder are instantiated, random
+    inputs and a zeroed KV cache are allocated, and the forward pass is timed.
+
+    Args:
+        config: Benchmark configuration
+
+    Returns:
+        BenchmarkResult with timing and memory statistics. Times are seconds
+        per layer; throughput is query tokens divided by the mean time.
+
+    Raises:
+        ValueError: If ``config.backend`` is not a registered backend.
+    """
+    device = torch.device(config.device)
+    torch.cuda.set_device(device)
+
+    backend_cfg = _get_backend_config(config.backend)
+
+    requests = parse_batch_spec(config.batch_spec)
+
+    if config.backend == "flashinfer":
+        requests = reorder_for_flashinfer(requests)
+
+    q_lens = [r.q_len for r in requests]
+    kv_lens = [r.kv_len for r in requests]
+    total_q = sum(q_lens)
+    max_kv = max(kv_lens)
+
+    # _build_common_attn_metadata gives every request its own run of
+    # ceil(max_kv / block_size) block ids, so the block table references ids up
+    # to num_requests * blocks_per_request - 1. The KV cache must contain that
+    # many blocks; sizing it for a single request would let the kernels index
+    # past the end of the cache whenever the batch has more than one request.
+    blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
+    max_num_blocks = len(q_lens) * blocks_per_request
+
+    backend_class, impl, layer, dtype = _create_backend_impl(
+        backend_cfg, config, device
+    )
+
+    common_metadata = _build_common_attn_metadata(
+        q_lens, kv_lens, config.block_size, device
+    )
+
+    kv_cache_spec = FullAttentionSpec(
+        block_size=config.block_size,
+        num_kv_heads=config.num_kv_heads,
+        head_size=config.head_dim,
+        dtype=dtype,
+    )
+
+    vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
+
+    builder = _create_metadata_builder(
+        backend_class, kv_cache_spec, vllm_config, device
+    )
+
+    attn_metadata = builder.build(
+        common_prefix_len=0,
+        common_attn_metadata=common_metadata,
+    )
+
+    q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
+
+    cache_list = _create_kv_cache(
+        config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
+    )
+
+    times, mem_stats = _run_single_benchmark(
+        config,
+        impl,
+        layer,
+        q_list,
+        k_list,
+        v_list,
+        cache_list,
+        attn_metadata,
+        device,
+        dtype,
+    )
+
+    mean_time = np.mean(times)
+    # Guard against a zero mean so the division cannot fail.
+    throughput = total_q / mean_time if mean_time > 0 else 0
+
+    return BenchmarkResult(
+        config=config,
+        mean_time=mean_time,
+        std_time=np.std(times),
+        min_time=np.min(times),
+        max_time=np.max(times),
+        throughput_tokens_per_sec=throughput,
+        # .get() yields None for both fields when memory profiling is off.
+        memory_allocated_mb=mem_stats.get("allocated_mb"),
+        memory_reserved_mb=mem_stats.get("reserved_mb"),
+    )
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -1,244 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-from packaging import version
-
-from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
-    MINIMUM_BITBLAS_VERSION,
-)
-
-try:
-    import bitblas
-
-    if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
-        raise ImportError(
-            "bitblas version is wrong. Please "
-            f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
-        )
-except ImportError as e:
-    bitblas_import_exception = e
-    raise ValueError(
-        "Trying to use the bitblas backend, but could not import"
-        f"with the following error: {bitblas_import_exception}. "
-        "Please install bitblas through the following command: "
-        f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
-    ) from bitblas_import_exception
-
-from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
-
-from vllm.utils.argparse_utils import FlexibleArgumentParser
-
-parser = FlexibleArgumentParser(
-    description="Benchmark BitBLAS int4 on a specific target."
-)
-
-# Add arguments to the parser
-parser.add_argument(
-    "--target",
-    type=str,
-    default=auto_detect_nvidia_target(),
-    help="Specify the target device for benchmarking.",
-)
-parser.add_argument(
-    "--group_size", type=int, default=None, help="Group size for grouped quantization."
-)
-parser.add_argument(
-    "--A_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "float32", "float64", "int32", "int8"],
-    help="Data type of activation A.",
-)
-parser.add_argument(
-    "--W_dtype",
-    type=str,
-    default="int4",
-    choices=[
-        "float16",
-        "float32",
-        "float64",
-        "int32",
-        "int8",
-        "int4",
-        "int2",
-        "int1",
-        "nf4",
-        "fp4_e2m1",
-    ],
-    help="Data type of weight W.",
-)
-parser.add_argument(
-    "--accum_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "int32"],
-    help="Data type for accumulation.",
-)
-parser.add_argument(
-    "--out_dtype",
-    type=str,
-    default="float16",
-    choices=["float16", "float32", "int32", "int8"],
-    help="Data type for output.",
-)
-parser.add_argument(
-    "--layout",
-    type=str,
-    default="nt",
-    choices=["nt", "nn"],
-    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
-)
-parser.add_argument(
-    "--with_bias", action="store_true", help="Include bias in the benchmark."
-)
-parser.add_argument(
-    "--with_scaling",
-    action="store_true",
-    help="Include scaling factor in the quantization.",
-)
-parser.add_argument(
-    "--with_zeros", action="store_true", help="Include zeros in the quantization."
-)
-parser.add_argument(
-    "--zeros_mode",
-    type=str,
-    default=None,
-    choices=["original", "rescale", "quantized"],
-    help="Specify the mode for calculating zeros.",
-)
-
-# Parse the arguments
-args = parser.parse_args()
-
-# Assign arguments to variables
-target = args.target
-A_dtype = args.A_dtype
-W_dtype = args.W_dtype
-accum_dtype = args.accum_dtype
-out_dtype = args.out_dtype
-layout = args.layout
-with_bias = args.with_bias
-group_size = args.group_size
-with_scaling = args.with_scaling
-with_zeros = args.with_zeros
-zeros_mode = args.zeros_mode
-
-# Define a list of shared arguments that repeat in every config
-shared_args = [
-    A_dtype,
-    W_dtype,
-    out_dtype,
-    accum_dtype,
-    layout,
-    with_bias,
-    group_size,
-    with_scaling,
-    with_zeros,
-    zeros_mode,
-]
-
-# Define just the (M, K, N) shapes in a more compact list
-shapes = [
-    # square test
-    (1, 16384, 16384),
-    # BLOOM-176B
-    (1, 43008, 14336),
-    (1, 14336, 14336),
-    (1, 57344, 14336),
-    (1, 14336, 57344),
-    # OPT-65B
-    (1, 9216, 9216),
-    (1, 36864, 9216),
-    (1, 9216, 36864),
-    (1, 22016, 8192),
-    # LLAMA-70B/65B
-    (1, 8192, 22016),
-    (1, 8192, 8192),
-    (1, 28672, 8192),
-    (1, 8192, 28672),
-    # square test
-    (16384, 16384, 16384),
-    # BLOOM-176B
-    (8192, 43008, 14336),
-    (8192, 14336, 14336),
-    (8192, 57344, 14336),
-    (8192, 14336, 57344),
-    # OPT-65B
-    (8192, 9216, 9216),
-    (8192, 36864, 9216),
-    (8192, 9216, 36864),
-    (8192, 22016, 8192),
-    # LLAMA-70B/65B
-    (8192, 8192, 22016),
-    (8192, 8192, 8192),
-    (8192, 28672, 8192),
-    (8192, 8192, 28672),
-]
-
-# Build test shapes with all the shared arguments
-test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
-
-benchmark_sets = []
-benchmark_sets.extend(test_shapes)
-
-benchmark_results = {}
-for config_class, operator, input_args in benchmark_sets:
-    config = config_class(*input_args)
-    matmul = operator(config, target=target, enable_tuning=True)
-    kernel_latency = matmul.profile_latency()
-
-    print("Time cost is: {:.3f} ms".format(kernel_latency))
-
-    profile_config = {
-        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "BitBLAS_top20_latency": kernel_latency,
-        }
-    }
-
-    benchmark_results.update(profile_config)
-
-# Define headers for the table
-headers = [
-    "PrimFunc",
-    "Input Arguments",
-    "BitBLAS Top20 Latency",
-]
-
-# Calculate column widths for pretty printing
-col_widths = [0, 0, 0]
-for config_key, values in benchmark_results.items():
-    args_split = config_key.split("-")
-    func_name = args_split[0]
-    input_args_str = "-".join(args_split[1:])
-    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
-    col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
-    col_widths[2] = max(
-        col_widths[2],
-        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
-        len(headers[2]) + 2,
-    )
-    # break only if you want to measure widths from a single example;
-    # otherwise, let it loop over all items.
-
-# Print header
-for i, header in enumerate(headers):
-    headers[i] = header.ljust(col_widths[i])
-print("".join(headers))
-print("-" * sum(col_widths))
-
-# Print rows
-for config_key, values in benchmark_results.items():
-    args_split = config_key.split("-")
-    func_name = args_split[0]
-    input_args_str = "-".join(args_split[1:])
-    row = [
-        func_name,
-        input_args_str,
-        f"{values['BitBLAS_top20_latency']:.3f} ms",
-    ]
-    row_str = "".join(
-        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
-    )
-    print(row_str)
--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -197,7 +197,7 @@ def bench_run(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp4(
                make_dummy_moe_config(),
                quant_config=quant_config,
@@ -242,7 +242,7 @@ def bench_run(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp4(
                make_dummy_moe_config(),
                quant_config=quant_config,
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -842,6 +842,7 @@ class BenchmarkTensors:
            "sorted_token_ids": sorted_token_ids,
            "expert_ids": expert_ids,
            "num_tokens_post_padded": num_tokens_post_padded,
+            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "top_k_num": ctx.top_k_num,
            "device": self.input.device,
            "N": lora_rank,
@@ -915,6 +916,7 @@ class BenchmarkTensors:
            "sorted_token_ids": sorted_token_ids,
            "expert_ids": expert_ids,
            "num_tokens_post_padded": num_tokens_post_padded,
+            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "top_k_num": ctx.top_k_num,
            "device": self.input.device,
            "N": lora_rank,
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -6,12 +6,6 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-    GPTQ_MARLIN_24_MAX_PARALLEL,
-    GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
-    GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
-)
 from vllm.model_executor.layers.quantization.utils.allspark_utils import (
    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
    ALLSPARK_SUPPORTED_QUANT_TYPES,
@@ -34,9 +28,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    awq_marlin_quantize,
    marlin_quantize,
 )
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
-    marlin_24_quantize,
-)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_pack,
    gptq_quantize_weights,
@@ -78,14 +69,7 @@ def bench_run(
    if size_k % group_size != 0:
        return

-    marlin_24_supported = (
-        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
-        and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
-    )
-    repack_supported = (
-        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
-        and group_size in MARLIN_SUPPORTED_GROUP_SIZES
-    )
+    repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES
    allspark_supported = (
        quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
        and group_size == -1
@@ -126,14 +110,6 @@ def bench_run(
            marlin_sort_indices,
        )

-    def gen_marlin_24_params():
-        marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
-        if marlin_24_supported:
-            (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
-                marlin_24_quantize(b, quant_type, group_size)
-            )
-        return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
-
    def gen_repack_params():
        q_w_gptq = None
        repack_sort_indices = None
@@ -188,9 +164,6 @@ def bench_run(
        marlin_g_idx,
        marlin_sort_indices,
    ) = gen_marlin_params()
-    marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
-        gen_marlin_24_params()
-    )
    q_w_gptq, repack_sort_indices = gen_repack_params()
    qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
        gen_allspark_params()
@@ -200,9 +173,6 @@ def bench_run(
    marlin_workspace = MarlinWorkspace(
        size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
    )
-    marlin_24_workspace = MarlinWorkspace(
-        size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
-    )

    globals = {
        # Gen params
@@ -222,12 +192,6 @@ def bench_run(
        "marlin_sort_indices": marlin_sort_indices,
        "marlin_workspace": marlin_workspace,
        "is_k_full": is_k_full,
-        # Marlin_24 params
-        "marlin_24_w_ref": marlin_24_w_ref,
-        "marlin_24_q_w_comp": marlin_24_q_w_comp,
-        "marlin_24_meta": marlin_24_meta,
-        "marlin_24_s": marlin_24_s,
-        "marlin_24_workspace": marlin_24_workspace,
        # GPTQ params
        "q_w_gptq": q_w_gptq,
        "repack_sort_indices": repack_sort_indices,
@@ -240,7 +204,6 @@ def bench_run(
        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
        # Kernels
        "marlin_gemm": ops.marlin_gemm,
-        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
        "gptq_marlin_repack": ops.gptq_marlin_repack,
        "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
    }
@@ -281,17 +244,6 @@ def bench_run(
        ).blocked_autorange(min_run_time=min_run_time)
    )

-    if marlin_24_supported:
-        results.append(
-            benchmark.Timer(
-                stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
-                globals=globals,
-                label=label,
-                sub_label=sub_label,
-                description="gptq_marlin_24_gemm",
-            ).blocked_autorange(min_run_time=min_run_time)
-        )
-
    if repack_supported:
        results.append(
            benchmark.Timer(
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
 )
-from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -482,6 +481,8 @@ class BenchmarkWorker:
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
+        # local import to allow serialization by ray
+
        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -535,6 +536,9 @@ class BenchmarkWorker:
        block_quant_shape: list[int],
        use_deep_gemm: bool,
    ) -> dict[str, int]:
+        # local import to allow serialization by ray
+        from vllm.platforms import current_platform
+
        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():
@@ -646,20 +650,28 @@ def save_configs(
        f.write("\n")


+def get_compressed_tensors_block_structure(config, default_value=None):
+    """Return the weight block structure from a quantization config dict.
+
+    Args:
+        config: Quantization config mapping; read via its "config_groups" key.
+        default_value: Returned when the block structure cannot be determined.
+
+    Returns:
+        ``weights["block_structure"]`` of the only config group. Returns
+        ``default_value`` when there is not exactly one config group, or when
+        the group, its "weights" entry or the "block_structure" key is missing.
+    """
+    # `or {}` also covers keys that are present but serialized as null, which
+    # would otherwise fail on len() / .get() below.
+    config_groups = config.get("config_groups") or {}
+    if len(config_groups) != 1:
+        return default_value
+    group = next(iter(config_groups.values())) or {}
+    weights = group.get("weights") or {}
+    return weights.get("block_structure", default_value)
+
+
 def get_weight_block_size_safety(config, default_value=None):
    quantization_config = getattr(config, "quantization_config", {})
    if isinstance(quantization_config, dict):
-        return quantization_config.get("weight_block_size", default_value)
+        if "weight_block_size" in quantization_config:
+            return quantization_config["weight_block_size"]
+        return get_compressed_tensors_block_structure(
+            quantization_config, default_value
+        )
    return default_value


-def main(args: argparse.Namespace):
-    print(args)
-
-    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
-    if args.model_prefix:
-        config = getattr(config, args.model_prefix)
-
+def get_model_params(config):
    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
@@ -674,9 +686,11 @@ def main(args: argparse.Namespace):
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
+        "GlmMoeDsaForCausalLM",
        "Glm4MoeForCausalLM",
        "Glm4MoeLiteForCausalLM",
        "NemotronHForCausalLM",
+        "MistralLarge3ForCausalLM",
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
@@ -697,16 +711,20 @@ def main(args: argparse.Namespace):
        topk = text_config.num_experts_per_tok
        intermediate_size = text_config.moe_intermediate_size
        hidden_size = text_config.hidden_size
-    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
+    elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
        E = config.num_experts
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
        hidden_size = config.hidden_size
-    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+    elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
        E = config.thinker_config.text_config.num_experts
        topk = config.thinker_config.text_config.num_experts_per_tok
        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
        hidden_size = config.thinker_config.text_config.hidden_size
+    elif config.architectures[0] == "PixtralForConditionalGeneration":
+        # Pixtral can contain different LLM architectures,
+        # recurse to get their parameters
+        return get_model_params(config.get_text_config())
    else:
        # Support for llama4
        config = config.get_text_config()
@@ -715,6 +733,16 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        hidden_size = config.hidden_size
+    return E, topk, intermediate_size, hidden_size
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
+    if args.model_prefix:
+        config = getattr(config, args.model_prefix)
+    E, topk, intermediate_size, hidden_size = get_model_params(config)
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -10,8 +10,6 @@ from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    _moe_permute,
-    _moe_unpermute_and_reduce,
    moe_permute,
    moe_unpermute,
 )
@@ -41,16 +39,13 @@ def benchmark_permute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
    # output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
-        align_block_size = None
        qhidden_states = hidden_states

    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
@@ -64,29 +59,13 @@ def benchmark_permute(
        input_gating.copy_(gating_output[i])

    def run():
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
-            )
-        else:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = _moe_permute(qhidden_states, None, topk_ids, num_experts, None, 16)
+        moe_permute(
+            qhidden_states,
+            a1q_scale=None,
+            topk_ids=topk_ids,
+            n_expert=num_experts,
+            expert_map=None,
+        )

    # JIT compilation & warmup
    run()
@@ -131,16 +110,12 @@ def benchmark_unpermute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
-        align_block_size = None
        qhidden_states = hidden_states

    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
@@ -150,78 +125,36 @@ def benchmark_unpermute(
    )

    def prepare():
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
-            )
-            # convert to fp16/bf16 as gemm output
-            return (
-                permuted_hidden_states.to(dtype),
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            )
-        else:
-            (
-                permuted_qhidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = _moe_permute(
-                qhidden_states, None, topk_ids, num_experts, None, block_m=16
-            )
-            # convert to fp16/bf16 as gemm output
-            return (
-                permuted_qhidden_states.to(dtype),
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            )
+        (
+            permuted_hidden_states,
+            _,
+            first_token_off,
+            inv_perm_idx,
+            _,
+        ) = moe_permute(
+            qhidden_states,
+            a1q_scale=None,
+            topk_ids=topk_ids,
+            n_expert=num_experts,
+            expert_map=None,
+        )
+        # convert to fp16/bf16 as gemm output
+        return (
+            permuted_hidden_states.to(dtype),
+            first_token_off,
+            inv_perm_idx,
+        )

    def run(input: tuple):
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = input
-            output = torch.empty_like(hidden_states)
-            moe_unpermute(
-                output,
-                permuted_hidden_states,
-                topk_weights,
-                inv_perm_idx,
-                first_token_off,
-            )
-        else:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = input
-            _moe_unpermute_and_reduce(
-                output_hidden_states,
-                permuted_hidden_states,
-                inv_perm,
-                topk_weights,
-                True,
-            )
+        (permuted_hidden_states, first_token_off, inv_perm_idx) = input
+        output = torch.empty_like(hidden_states)
+        moe_unpermute(
+            output,
+            permuted_hidden_states,
+            topk_weights,
+            inv_perm_idx,
+            first_token_off,
+        )

    # JIT compilation & warmup
    input = prepare()
@@ -276,8 +209,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        use_customized_permute: bool = False,
-    ) -> tuple[dict[str, int], float]:
+    ) -> tuple[float, float]:
        set_random_seed(self.seed)

        permute_time = benchmark_permute(
@@ -289,7 +221,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        unpermute_time = benchmark_unpermute(
            num_tokens,
@@ -300,7 +231,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        return permute_time, unpermute_time

@@ -347,7 +277,6 @@ def main(args: argparse.Namespace):
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    use_customized_permute = args.use_customized_permute

    if args.batch_size is None:
        batch_sizes = [
@@ -399,7 +328,6 @@ def main(args: argparse.Namespace):
                dtype,
                use_fp8_w8a8,
                use_int8_w8a16,
-                use_customized_permute,
            )
            for batch_size in batch_sizes
        ],
@@ -419,7 +347,6 @@ if __name__ == "__main__":
    parser.add_argument(
        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
    )
-    parser.add_argument("--use-customized-permute", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--trust-remote-code", action="store_true")
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)

-assert current_platform.is_cuda(), (
-    "Only support tune w8a8 block fp8 kernel on CUDA device."
+assert current_platform.is_cuda() or current_platform.is_rocm(), (
+    "Only support tune w8a8 block fp8 kernel on CUDA/ROCm device."
 )

 DTYPE_MAP = {
--- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
 )
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa


@@ -58,7 +58,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
--- a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
@@ -7,8 +7,8 @@ import time
 import numpy as np
 import torch

-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed

 # Check if CPU MoE operations are available
 try:
@@ -41,7 +41,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    # up_dim = 2 * intermediate_size for gate + up projection
    up_dim = 2 * intermediate_size

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -359,6 +359,19 @@ else()
    add_compile_definitions(-DVLLM_NUMA_DISABLED)
 endif()

+#
+# Generate CPU attention dispatch header
+#
+# The generator script is launched through execute_process(), so it runs
+# while CMake configures the project (not at build time), with csrc/cpu as
+# its working directory. Its exit status is captured in GEN_RESULT and any
+# non-zero value aborts configuration with a fatal error.
+message(STATUS "Generating CPU attention dispatch header")
+execute_process(
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
+    RESULT_VARIABLE GEN_RESULT
+)
+if(NOT GEN_RESULT EQUAL 0)
+    message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
+endif()
+
 #
 # _C extension
 #
--- a/cmake/external_projects/triton_kernels.cmake
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -1,9 +1,9 @@
 # Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

-set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
+set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")

 # Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
-# be directly set to the triton_kernels python directory. 
+# be directly set to the triton_kernels python directory.
 if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
  FetchContent_Declare(
@@ -24,7 +24,7 @@ else()
  )
 endif()

-# Fetch content 
+# Fetch content
 FetchContent_MakeAvailable(triton_kernels)

 if (NOT triton_kernels_SOURCE_DIR)
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
 ## Copy .py files to install directory.
 install(DIRECTORY
        ${TRITON_KERNELS_PYTHON_DIR}
-        DESTINATION 
+        DESTINATION
        vllm/third_party/triton_kernels/
        COMPONENT triton_kernels
        FILES_MATCHING PATTERN "*.py")
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -24,6 +24,12 @@
 typedef __hip_bfloat16 __nv_bfloat16;
 #endif

+// Divisor applied to an observed absolute maximum (max_abs / amax in the
+// kernels below) when deriving an FP8 quantization scale. Selected at
+// compile time: 224 when __gfx942__ is defined, 448 otherwise.
+#if defined(__gfx942__)
+constexpr float kFp8ScaleDivisor = 224.f;
+#else
+constexpr float kFp8ScaleDivisor = 448.f;
+#endif
+
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 int64_t block_size_in_bytes,
                 const torch::Tensor& block_mapping) {
@@ -401,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
  }

  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
+  float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);

  // The first lane of each half-warp writes the scale to kv_cache
  if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -471,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #endif
  }

-#if defined(__gfx942__)
-  float scale = fmaxf(amax, 1e-4) / 224.0f;
-#else
-  float scale = fmaxf(amax, 1e-4) / 448.0f;
-#endif
+  float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
+
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));
  }
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,79 +1,4 @@
-#include "cpu_attn_vec.hpp"
-#include "cpu_attn_vec16.hpp"
-
-#ifdef CPU_CAPABILITY_AMXBF16
-  #include "cpu_attn_amx.hpp"
-  #define AMX_DISPATCH(...)                                                   \
-    case cpu_attention::ISA::AMX: {                                           \
-      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
-                                                     scalar_t, head_dim>;     \
-      return __VA_ARGS__();                                                   \
-    }
-#else
-  #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
-#endif
-
-#ifdef __aarch64__
-  #include "cpu_attn_neon.hpp"
-  // NEON requires head_dim to be a multiple of 32
-  #define NEON_DISPATCH(...)                                                   \
-    case cpu_attention::ISA::NEON: {                                           \
-      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
-                                                     scalar_t, head_dim>;      \
-      return __VA_ARGS__();                                                    \
-    }
-#else
-  #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
-#endif  // #ifdef __aarch64__
-
-#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
-  case HEAD_DIM: {                            \
-    constexpr size_t head_dim = HEAD_DIM;     \
-    return __VA_ARGS__();                     \
-  }
-
-#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)           \
-  [&] {                                                         \
-    switch (HEAD_DIM) {                                         \
-      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                  \
-      default: {                                                \
-        TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
-                               std::to_string(HEAD_DIM));       \
-      }                                                         \
-    }                                                           \
-  }()
-
-#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                 \
-  [&] {                                                                       \
-    switch (ISA_TYPE) {                                                       \
-      AMX_DISPATCH(__VA_ARGS__)                                               \
-      NEON_DISPATCH(__VA_ARGS__)                                              \
-      case cpu_attention::ISA::VEC: {                                         \
-        using attn_impl =                                                     \
-            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
-                                         head_dim>;                           \
-        return __VA_ARGS__();                                                 \
-      }                                                                       \
-      case cpu_attention::ISA::VEC16: {                                       \
-        using attn_impl =                                                     \
-            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
-                                         head_dim>;                           \
-        return __VA_ARGS__();                                                 \
-      }                                                                       \
-      default: {                                                              \
-        TORCH_CHECK(false, "Invalid CPU attention ISA type.");                \
-      }                                                                       \
-    }                                                                         \
-  }()
+#include "cpu_attn_dispatch_generated.h"

 torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,
@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
  input.enable_kv_split = enable_kv_split;

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
-    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
-      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
-        input.elem_size = sizeof(scalar_t);
-        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
-        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
-        input.output_buffer_elem_size =
-            sizeof(attn_impl::partial_output_buffer_t);
-        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
-        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
-      });
+    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+      input.elem_size = sizeof(scalar_t);
+      input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
+      input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
+      input.output_buffer_elem_size =
+          sizeof(attn_impl::partial_output_buffer_t);
+      input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
+      input.kv_block_alignment = attn_impl::BlockSizeAlignment;
    });
  });

@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
-        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
-          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
-            attn_impl::reshape_and_cache(
-                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-                key_cache.data_ptr<scalar_t>(),
-                value_cache.data_ptr<scalar_t>(),
-                slot_mapping.data_ptr<int64_t>(), token_num,
-                key_token_num_stride, value_token_num_stride, head_num,
-                key_head_num_stride, value_head_num_stride, num_blocks,
-                num_blocks_stride, cache_head_num_stride, block_size,
-                block_size_stride);
-          });
+        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+          attn_impl::reshape_and_cache(
+              key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
+              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
+              value_token_num_stride, head_num, key_head_num_stride,
+              value_head_num_stride, num_blocks, num_blocks_stride,
+              cache_head_num_stride, block_size, block_size_stride);
        });
      });
 }
@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
-        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
-          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
-            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
-            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
-            mainloop(&input);
-          });
+        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
+          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
+          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+          mainloop(&input);
        });
      });
 }
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, const float scale) {
    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
-    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
    constexpr int64_t head_elem_num_pre_block =
        AMX_TILE_ROW_BYTES / sizeof(scalar_t);
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
 };

-// ARM only supports BF16 with ARMv8.6-A extension
-#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
-#else
 template <>
 struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
 };
-#endif

 #if !defined(__powerpc__) && !defined(__s390x__)
 template <>
@@ -1111,7 +1107,8 @@ class AttentionMainLoop {
          if (sliding_window_left != -1) {
            pos = std::max(pos, curr_token_pos - sliding_window_left);
          }
-          return pos;
+          // Clamp to tile end to avoid OOB when window starts past the tile
+          return std::min(pos, kv_tile_end_pos);
        }();

        int32_t right_kv_pos = [&]() {
@@ -1585,17 +1582,10 @@ class AttentionMainLoop {

              if (use_sink) {
                alignas(64) float s_aux_fp32[16];
-#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
-                // ARM without native BF16 support: manual conversion
-                for (int i = 0; i < 16; ++i) {
-                  s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
-                }
-#else
                // Widen 16 BF16 s_aux values to FP32 via the vector types
                vec_op::BF16Vec16 vec_bf16(curr_s_aux);
                vec_op::FP32Vec16 vec_fp32(vec_bf16);
                vec_fp32.save(s_aux_fp32);
-#endif

                float* __restrict__ curr_sum_buffer = sum_buffer;
                float* __restrict__ curr_max_buffer = max_buffer;
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -4,6 +4,9 @@
 #include "cpu_attn_impl.hpp"
 #include <arm_neon.h>
 #include <type_traits>
+#ifdef ARM_BF16_SUPPORT
+  #include "cpu_attn_neon_bfmmla.hpp"
+#endif
 namespace cpu_attention {

 namespace {
@@ -57,7 +60,7 @@ FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
 #endif
 }

-// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
+// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with ASIMD FMLAs
 // #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
 // #FMLAs = (K // 4) * (4 * 2 * M)
 // We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
@@ -264,7 +267,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
  constexpr static ISA ISAType = ISA::NEON;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

-  //  static_assert(HeadDim % HeadDimAlignment == 0);
+  static_assert(HeadDim % HeadDimAlignment == 0);
  // the gemm micro kernel is Mx8
  static_assert(HeadDimAlignment % 8 == 0);
  static_assert(BlockSizeAlignment % 8 == 0);
@@ -381,6 +384,18 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
    }
  }
 };
+
+#ifdef ARM_BF16_SUPPORT
+// For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
+// The specialization body is empty: every member is inherited from
+// AttentionImplNEONBFMMLA, instantiated with BLOCK_SIZE_ALIGNMENT, ISA::NEON
+// and the requested head_dim.
+// NOTE(review): "32-token" presumes BLOCK_SIZE_ALIGNMENT == 32 -- confirm
+// against the macro definition earlier in this header.
+template <int64_t head_dim>
+class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
+    : public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
+                                     head_dim> {};
+#endif
 }  // namespace cpu_attention

-#endif  // #ifndef CPU_ATTN_NEON_HPP
+#undef BLOCK_SIZE_ALIGNMENT
+#undef HEAD_SIZE_ALIGNMENT
+#undef MAX_Q_HEAD_NUM_PER_ITER
+
+#endif  // #ifndef CPU_ATTN_NEON_HPP
--- a/csrc/cpu/cpu_attn_neon_bfmmla.hpp
+++ b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
@@ -0,0 +1,682 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#ifndef CPU_ATTN_NEON_BFMMLA_HPP
+#define CPU_ATTN_NEON_BFMMLA_HPP
+
+#include "cpu_attn_impl.hpp"
+
+#include <arm_neon.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace cpu_attention {
+
+namespace {
+
+// BFMMLA tile dimensions
+constexpr int32_t TILE_ROWS = 2;  // M dimension
+constexpr int32_t TILE_K = 4;     // K reduction
+constexpr int32_t TILE_COLS = 2;  // N dimension (column-pair)
+
+// Derived constants
+constexpr int32_t OUTPUT_COLS_PER_BLOCK = 8;   // 4 column-pairs
+constexpr int32_t K_TOKENS_PER_GROUP = 8;      // Tokens grouped in K cache
+constexpr int32_t V_TOKENS_PER_ROW_BLOCK = 4;  // Tokens per V cache row block
+// 8 * 4 = 32 elements
+constexpr int32_t K_INNER_STRIDE = K_TOKENS_PER_GROUP * TILE_K;
+// 4 * 2 = 8 elements, i.e. exactly one bfloat16x8_t vector load
+constexpr int32_t V_INNER_STRIDE = V_TOKENS_PER_ROW_BLOCK * TILE_COLS;
+// 2 * 4 = 8 elements written per TILE_K-wide chunk of a packed row pair
+constexpr int32_t PACK_ELEMENTS_PER_K_CHUNK = TILE_ROWS * TILE_K;  // A packing
+
+// Matrix Packing and Accumulator
+// Reshape two rows of Q into a BFMMLA-friendly interleaved layout: for every
+// TILE_K-wide chunk, TILE_K elements of r0 are followed by the TILE_K
+// elements of r1 taken from the same offsets.
+// Input:  row0 = [a0,a1,...,a7], row1 = [b0,b1,...,b7]
+// Output: [a0,a1,a2,a3,b0,b1,b2,b3, a4,a5,a6,a7,b4,b5,b6,b7]
+// For K tail (K % TILE_K != 0): pads with zeros to complete the final chunk
+//
+// r0, r1: the two source rows, each read for K elements.
+// dst:    destination buffer; receives PACK_ELEMENTS_PER_K_CHUNK elements per
+//         started chunk, i.e. ceil(K / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK.
+// K:      number of valid elements in each source row.
+FORCE_INLINE void reshape_Q_2xK_for_bfmmla(const c10::BFloat16* __restrict r0,
+                                           const c10::BFloat16* __restrict r1,
+                                           c10::BFloat16* __restrict dst,
+                                           int32_t K) {
+  // Values are only moved, never converted, so handle them as raw 16-bit
+  // patterns.
+  const uint16_t* s0 = reinterpret_cast<const uint16_t*>(r0);
+  const uint16_t* s1 = reinterpret_cast<const uint16_t*>(r1);
+  uint16_t* d = reinterpret_cast<uint16_t*>(dst);
+
+  // Process TILE_K elements at a time (PACK_ELEMENTS_PER_K_CHUNK output)
+  int32_t k = 0;
+  for (; k + TILE_K <= K; k += TILE_K, d += PACK_ELEMENTS_PER_K_CHUNK) {
+    // Low half of the stored vector comes from r0, high half from r1.
+    vst1q_u16(d, vcombine_u16(vld1_u16(s0 + k), vld1_u16(s1 + k)));
+  }
+
+  // Handle K tail: pack remaining elements with zero-padding
+  const int32_t tail = K - k;
+  if (tail > 0) {
+    // Pack remaining tail elements: [r0[k..k+tail-1], pad, r1[k..k+tail-1],
+    // pad]
+    for (int32_t t = 0; t < tail; ++t) {
+      d[t] = s0[k + t];
+      d[t + TILE_K] = s1[k + t];
+    }
+    // Zero-pad the rest
+    for (int32_t t = tail; t < TILE_K; ++t) {
+      d[t] = 0;
+      d[t + TILE_K] = 0;
+    }
+  }
+}
+
+// 2x2 accumulator load/store with compile-time row count
+// Builds one float32x4_t accumulator from C. The low half is the two floats
+// at base[col_off] and base[col_off + 1]; the high half is the two floats at
+// the same columns one row further (base + ldc) when m_rows == 2, or zeros
+// when m_rows == 1, in which case the second row is never read.
+template <int32_t m_rows>
+FORCE_INLINE float32x4_t load_acc_2x2(float* base, int64_t ldc, int col_off) {
+  static_assert(m_rows == 1 || m_rows == 2);
+  float32x2_t row0 = vld1_f32(base + col_off);
+  float32x2_t row1 =
+      (m_rows == 2) ? vld1_f32(base + ldc + col_off) : vdup_n_f32(0.f);
+  return vcombine_f32(row0, row1);
+}
+
+// Writes a float32x4_t accumulator back to C: the low half goes to
+// base[col_off] and base[col_off + 1]; the high half is written to the same
+// columns one row further (base + ldc) only when m_rows == 2, so with
+// m_rows == 1 the second row is left untouched.
+template <int32_t m_rows>
+FORCE_INLINE void store_acc_2x2(float32x4_t acc, float* base, int64_t ldc,
+                                int col_off) {
+  static_assert(m_rows == 1 || m_rows == 2);
+  vst1_f32(base + col_off, vget_low_f32(acc));
+  if constexpr (m_rows == 2) {
+    vst1_f32(base + ldc + col_off, vget_high_f32(acc));
+  }
+}
+
+// Initialize 4 column-pair accumulators for 2 rows (8 columns total)
+// a0..a3: float32x4_t lvalues receiving column offsets 0, 2, 4 and 6.
+// Crow:   pointer to the first of the two C rows; ldc: row stride of C.
+// m_rows: run-time row count; 2 loads both rows, any other value takes the
+//         single-row path where the second row is zero-filled, not read.
+// accum:  when false, C is not read and all four accumulators start at zero.
+#define INIT_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows, accum) \
+  do {                                                               \
+    if (accum) {                                                     \
+      if (m_rows == 2) {                                             \
+        a0 = load_acc_2x2<2>(Crow, ldc, 0);                          \
+        a1 = load_acc_2x2<2>(Crow, ldc, 2);                          \
+        a2 = load_acc_2x2<2>(Crow, ldc, 4);                          \
+        a3 = load_acc_2x2<2>(Crow, ldc, 6);                          \
+      } else {                                                       \
+        a0 = load_acc_2x2<1>(Crow, ldc, 0);                          \
+        a1 = load_acc_2x2<1>(Crow, ldc, 2);                          \
+        a2 = load_acc_2x2<1>(Crow, ldc, 4);                          \
+        a3 = load_acc_2x2<1>(Crow, ldc, 6);                          \
+      }                                                              \
+    } else {                                                         \
+      a0 = a1 = a2 = a3 = vdupq_n_f32(0.f);                          \
+    }                                                                \
+  } while (0)
+
+// Store 4 column-pair accumulators back to C matrix
+// a0..a3 are written at column offsets 0, 2, 4 and 6 of the row at Crow.
+// m_rows is a run-time row count: 2 also writes the following row
+// (Crow + ldc); any other value writes only the first row.
+#define STORE_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows) \
+  do {                                                         \
+    if (m_rows == 2) {                                         \
+      store_acc_2x2<2>(a0, Crow, ldc, 0);                      \
+      store_acc_2x2<2>(a1, Crow, ldc, 2);                      \
+      store_acc_2x2<2>(a2, Crow, ldc, 4);                      \
+      store_acc_2x2<2>(a3, Crow, ldc, 6);                      \
+    } else {                                                   \
+      store_acc_2x2<1>(a0, Crow, ldc, 0);                      \
+      store_acc_2x2<1>(a1, Crow, ldc, 2);                      \
+      store_acc_2x2<1>(a2, Crow, ldc, 4);                      \
+      store_acc_2x2<1>(a3, Crow, ldc, 6);                      \
+    }                                                          \
+  } while (0)
+
+// Perform 4 BFMMLA operations: acc += A @ B for 4 column-pairs
+// r0..r3: float32x4_t accumulators, each updated in place.
+// a:      the shared left-hand operand, reused by all four operations.
+// b0..b3: one right-hand operand per accumulator.
+#define BFMMLA_COMPUTE_4(r0, r1, r2, r3, a, b0, b1, b2, b3) \
+  do {                                                      \
+    r0 = vbfmmlaq_f32(r0, a, b0);                           \
+    r1 = vbfmmlaq_f32(r1, a, b1);                           \
+    r2 = vbfmmlaq_f32(r2, a, b2);                           \
+    r3 = vbfmmlaq_f32(r3, a, b3);                           \
+  } while (0)
+
+// Micro-kernel: updates a small fixed tile using BFMMLA.
+// RP = number of row-pairs (1,2,4)
+// Computes C[TILE_ROWS*RP, OUTPUT_COLS_PER_BLOCK] += A_packed @ B.
+// A_packed interleaves RP row-pairs; B layout is driven by the attention phase:
+// - AttentionGemmPhase::QK -> token-column layout (Q @ K^T)
+// - AttentionGemmPhase::PV -> token-row layout (P @ V)
+// K_static < 0 enables runtime K (PV only)
+//
+// Arguments:
+// - A_packed_rp: RP pointers, one packed A buffer per row-pair. Each K chunk
+//   is loaded as one bfloat16x8_t at stride PACK_ELEMENTS_PER_K_CHUNK.
+// - m_rows_rp:   per row-pair row count, forwarded to the init/store macros.
+//   The tail path treats a value of 2 as "second row present".
+// - B_blk:       start of the B data for this output-column chunk.
+// - C, ldc:      output tile and its row stride in floats; row-pair rp starts
+//   at C + (rp * 2) * ldc.
+// - accumulate:  forwarded to INIT_ACC_ROWPAIR_4 (defined outside this view;
+//   presumably selects "load existing C" vs. "start from zero" - confirm).
+// - b_stride:    PV: distance between the four B streams b_ptr[0..3];
+//   QK: distance between consecutive K chunks of B.
+// - K_runtime:   reduction length, only read when K_static < 0.
+template <int32_t RP, int32_t K_static, AttentionGemmPhase phase>
+FORCE_INLINE void gemm_rowpairs_x8_bfmmla_neon(
+    const bfloat16_t* const* __restrict A_packed_rp,
+    const int32_t* __restrict m_rows_rp, const bfloat16_t* __restrict B_blk,
+    float* __restrict C, int64_t ldc, bool accumulate, int64_t b_stride,
+    int32_t K_runtime = 0) {
+  static_assert(RP == 1 || RP == 2 || RP == 4, "RP must be 1,2,4");
+  static_assert(K_static < 0 || K_static % TILE_K == 0,
+                "K must be divisible by TILE_K");
+  static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
+                "Runtime K only supported for PV");
+
+  // Full TILE_K-sized chunks go through BFMMLA; any remainder (runtime K
+  // only) is handled by the scalar-widening tail further down.
+  constexpr bool runtime_k = (K_static < 0);
+  const int32_t K_iters =
+      runtime_k ? (K_runtime / TILE_K) : (K_static / TILE_K);
+  const int32_t K_tail = runtime_k ? (K_runtime % TILE_K) : 0;
+
+  if (!runtime_k) {
+    // Help the compiler fold away unused K_runtime when K is compile-time
+    (void)K_runtime;
+  }
+
+  auto* C_al = C;
+  const auto* B_al = B_blk;
+
+  // Setup A pointers
+  // Slots beyond RP are nullptr; they are never dereferenced because every
+  // use below is guarded by `if constexpr (RP > rp)`.
+  const bfloat16_t* a_ptr[4] = {
+      A_packed_rp[0],
+      (RP >= 2) ? A_packed_rp[1] : nullptr,
+      (RP >= 4) ? A_packed_rp[2] : nullptr,
+      (RP >= 4) ? A_packed_rp[3] : nullptr,
+  };
+
+  // Setup B pointers based on layout
+  // Only initialised for PV; the QK path indexes B_al directly and never
+  // reads b_ptr.
+  const bfloat16_t* b_ptr[4];
+  if constexpr (phase == AttentionGemmPhase::PV) {
+    b_ptr[0] = B_blk + 0 * b_stride;
+    b_ptr[1] = B_blk + 1 * b_stride;
+    b_ptr[2] = B_blk + 2 * b_stride;
+    b_ptr[3] = B_blk + 3 * b_stride;
+  }
+
+  // acc[rp][j]: accumulator for row-pair rp and B vector j (j = 0..3).
+  float32x4_t acc[4][4];
+
+// Initialize accumulators
+#define INIT_RP(rp)                                                            \
+  if constexpr (RP > rp) {                                                     \
+    INIT_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3],         \
+                       C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp], accumulate); \
+  }
+  INIT_RP(0);
+  INIT_RP(1);
+  INIT_RP(2);
+  INIT_RP(3);
+#undef INIT_RP
+
+  // Main compute loop
+  // Each iteration loads four B vectors once and reuses them for every
+  // active row-pair.
+  for (int32_t ki = 0; ki < K_iters; ++ki) {
+    bfloat16x8_t b0, b1, b2, b3;
+    if constexpr (phase == AttentionGemmPhase::PV) {
+      b0 = vld1q_bf16(b_ptr[0] + ki * V_INNER_STRIDE);
+      b1 = vld1q_bf16(b_ptr[1] + ki * V_INNER_STRIDE);
+      b2 = vld1q_bf16(b_ptr[2] + ki * V_INNER_STRIDE);
+      b3 = vld1q_bf16(b_ptr[3] + ki * V_INNER_STRIDE);
+    } else {
+      const bfloat16_t* b_base = B_al + ki * b_stride;
+      b0 = vld1q_bf16(b_base + 0 * V_INNER_STRIDE);
+      b1 = vld1q_bf16(b_base + 1 * V_INNER_STRIDE);
+      b2 = vld1q_bf16(b_base + 2 * V_INNER_STRIDE);
+      b3 = vld1q_bf16(b_base + 3 * V_INNER_STRIDE);
+    }
+
+#define COMPUTE_RP(rp)                                                       \
+  if constexpr (RP > rp) {                                                   \
+    bfloat16x8_t a = vld1q_bf16(a_ptr[rp] + ki * PACK_ELEMENTS_PER_K_CHUNK); \
+    BFMMLA_COMPUTE_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], a, b0,  \
+                     b1, b2, b3);                                            \
+  }
+    COMPUTE_RP(0);
+    COMPUTE_RP(1);
+    COMPUTE_RP(2);
+    COMPUTE_RP(3);
+#undef COMPUTE_RP
+  }
+
+  // K tail for runtime PV: fallback path
+  // Processes the K_tail leftover reduction steps one at a time by widening
+  // individual BF16 elements to FP32 and using a plain FP32 multiply-add.
+  if constexpr (runtime_k) {
+    if (K_tail > 0) {
+      const int32_t tail_offset = K_iters * V_INNER_STRIDE;
+      const int32_t a_tail_offset = K_iters * PACK_ELEMENTS_PER_K_CHUNK;
+      for (int32_t kt = 0; kt < K_tail; ++kt) {
+        // b_vecs[p] = {b0, b1, b0, b1}, paired below with
+        // a_vec = {row0, row0, row1, row1}.
+        // NOTE(review): B is read here as two adjacent elements at offset
+        // kt * TILE_COLS, whereas reshape_and_cache() in this file stores the
+        // two columns of one token V_TOKENS_PER_ROW_BLOCK elements apart and
+        // the A side below indexes ap[kt] / ap[kt + TILE_K]. Confirm that the
+        // tail addressing matches the V cache layout.
+        float32x4_t b_vecs[4];
+        for (int32_t p = 0; p < 4; ++p) {
+          const bfloat16_t* bp = b_ptr[p] + tail_offset + kt * TILE_COLS;
+          const float b0 = vcvtah_f32_bf16(bp[0]);
+          const float b1 = vcvtah_f32_bf16(bp[1]);
+          const float32x2_t b_pair = vset_lane_f32(b1, vdup_n_f32(b0), 1);
+          b_vecs[p] = vcombine_f32(b_pair, b_pair);
+        }
+
+#define TAIL_RP(rp)                                                     \
+  if constexpr (RP > rp) {                                              \
+    const bfloat16_t* ap = A_packed_rp[rp] + a_tail_offset;             \
+    float a_row0 = vcvtah_f32_bf16(ap[kt]);                             \
+    float a_row1 =                                                      \
+        (m_rows_rp[rp] == 2) ? vcvtah_f32_bf16(ap[kt + TILE_K]) : 0.0f; \
+    const float32x4_t a_vec =                                           \
+        vcombine_f32(vdup_n_f32(a_row0), vdup_n_f32(a_row1));           \
+    for (int32_t p = 0; p < 4; ++p) {                                   \
+      acc[rp][p] = vmlaq_f32(acc[rp][p], a_vec, b_vecs[p]);             \
+    }                                                                   \
+  }
+        TAIL_RP(0);
+        TAIL_RP(1);
+        TAIL_RP(2);
+        TAIL_RP(3);
+#undef TAIL_RP
+      }
+    }
+  }
+
+  // Store results
+#define STORE_RP(rp)                                                    \
+  if constexpr (RP > rp) {                                              \
+    STORE_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
+                        C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp]);     \
+  }
+  STORE_RP(0);
+  STORE_RP(1);
+  STORE_RP(2);
+  STORE_RP(3);
+#undef STORE_RP
+}
+
+// Meso-kernel: packs a small MBxK slice of A, then tiles over N and calls the
+// micro-kernel for each OUTPUT_COLS_PER_BLOCK chunk. K_static < 0 enables
+// runtime K (PV only).
+//
+// Arguments:
+// - A, lda:  MB source rows of A with row stride lda.
+// - B:       B data; the per-chunk start is derived from b_layout_stride.
+// - C, ldc:  output rows with row stride ldc (in floats).
+// - K_runtime: reduction length, only used when K_static < 0.
+// - b_layout_stride: stride used to locate the B block for output column n.
+//   For PV it is also forwarded to the micro-kernel as its b_stride.
+// - b_reduction_stride: forwarded to the micro-kernel as b_stride for QK.
+// - accumulate: forwarded unchanged to the micro-kernel.
+template <int32_t MB, int32_t N, int32_t K_static, AttentionGemmPhase phase>
+FORCE_INLINE void gemm_packA_compute_MB_xN(
+    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
+    float* __restrict C, int32_t K_runtime, int64_t lda, int64_t ldc,
+    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
+  static_assert(MB >= 1 && MB <= 8, "MB must be in [1,8]");
+  static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
+                "N must be a multiple of OUTPUT_COLS_PER_BLOCK");
+  static_assert(K_static < 0 || K_static % TILE_K == 0,
+                "K must be divisible by TILE_K");
+  static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
+                "Runtime K only supported for PV");
+
+  constexpr bool runtime_k = (K_static < 0);
+  const int32_t K_val = runtime_k ? K_runtime : K_static;
+
+  // Keep small packs on-stack to avoid heap churn
+  // Per-row-pair capacity of the stack buffer: 1024 / TILE_K chunks.
+  constexpr int32_t STACK_PACK_STRIDE =
+      (1024 / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
+
+  // Number of row-pairs needed for MB rows (rounds up when TILE_ROWS is 2;
+  // the constant is defined outside this view - TODO confirm).
+  constexpr int32_t ROW_PAIRS = (MB + 1) / TILE_ROWS;
+  // Packed elements per row-pair; runtime K is rounded up to whole chunks.
+  const int32_t pack_stride =
+      runtime_k ? ((K_val + TILE_K - 1) / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK
+                : (K_static / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
+
+  // Use the stack buffer when one row-pair fits in STACK_PACK_STRIDE,
+  // otherwise size a heap vector on demand.
+  alignas(64) c10::BFloat16 A_packed_stack[ROW_PAIRS * STACK_PACK_STRIDE];
+  std::vector<c10::BFloat16> A_packed_heap;
+  c10::BFloat16* A_packed =
+      (pack_stride <= STACK_PACK_STRIDE)
+          ? A_packed_stack
+          : (A_packed_heap.resize(ROW_PAIRS * pack_stride),
+             A_packed_heap.data());
+
+  // Pack A once per row-pair. When MB is odd the last row-pair has a single
+  // row, and that row is passed for both inputs of the packer.
+  for (int32_t rp = 0; rp < ROW_PAIRS; ++rp) {
+    const int32_t m = rp * TILE_ROWS;
+    const int32_t m_rows = (m + 1 < MB) ? TILE_ROWS : 1;
+    const c10::BFloat16* A0 = A + m * lda;
+    const c10::BFloat16* A1 = (m_rows == TILE_ROWS) ? (A + (m + 1) * lda) : A0;
+    reshape_Q_2xK_for_bfmmla(A0, A1, A_packed + rp * pack_stride, K_val);
+  }
+
+  // Walk the output columns in OUTPUT_COLS_PER_BLOCK steps.
+  for (int32_t n = 0; n < N; n += OUTPUT_COLS_PER_BLOCK) {
+    // PV advances B by b_layout_stride per TILE_COLS output columns; QK
+    // advances by b_layout_stride per OUTPUT_COLS_PER_BLOCK output columns.
+    const c10::BFloat16* B_blk_c10 =
+        (phase == AttentionGemmPhase::PV)
+            ? (B + (n / TILE_COLS) * b_layout_stride)
+            : (B + (n / OUTPUT_COLS_PER_BLOCK) * b_layout_stride);
+    const bfloat16_t* B_blk = reinterpret_cast<const bfloat16_t*>(B_blk_c10);
+
+    // Process row-pairs in groups of 4, 2, then 1
+    // row_pair_idx is shared by the three macro expansions below, so each
+    // group size picks up where the previous one stopped.
+    int32_t row_pair_idx = 0;
+
+#define PROCESS_RP_GROUP(group_size)                                       \
+  for (; row_pair_idx + (group_size - 1) < ROW_PAIRS;                      \
+       row_pair_idx += group_size) {                                       \
+    const bfloat16_t* Ap[group_size];                                      \
+    int32_t mr[group_size];                                                \
+    for (int32_t i = 0; i < group_size; ++i) {                             \
+      Ap[i] = reinterpret_cast<const bfloat16_t*>(                         \
+          A_packed + (row_pair_idx + i) * pack_stride);                    \
+      mr[i] = (((row_pair_idx + i) * TILE_ROWS + 1) < MB) ? TILE_ROWS : 1; \
+    }                                                                      \
+    float* C_blk = C + (row_pair_idx * TILE_ROWS) * ldc + n;               \
+    if constexpr (runtime_k) {                                             \
+      gemm_rowpairs_x8_bfmmla_neon<group_size, -1, phase>(                 \
+          Ap, mr, B_blk, C_blk, ldc, accumulate, b_layout_stride, K_val);  \
+    } else {                                                               \
+      gemm_rowpairs_x8_bfmmla_neon<group_size, K_static, phase>(           \
+          Ap, mr, B_blk, C_blk, ldc, accumulate,                           \
+          (phase == AttentionGemmPhase::PV) ? b_layout_stride              \
+                                            : b_reduction_stride);         \
+    }                                                                      \
+  }
+
+    PROCESS_RP_GROUP(4);
+    PROCESS_RP_GROUP(2);
+    PROCESS_RP_GROUP(1);
+#undef PROCESS_RP_GROUP
+  }
+}
+
+// Macro-kernel: walks the M dimension greedily in row chunks of 8, 4, 2 or 1
+// and hands each chunk to gemm_packA_compute_MB_xN.
+// With K >= 0 the reduction length is a compile-time constant and the
+// meso-kernel receives 0 as its runtime K; with K < 0 the meso-kernel is
+// instantiated with K_static = -1 and receives K_runtime (PV phase only).
+template <AttentionGemmPhase phase, int32_t N, int32_t K = -1>
+FORCE_INLINE void gemm_macro_neon_bfmmla(
+    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
+    float* __restrict C, int32_t M, int32_t K_runtime, int64_t lda, int64_t ldc,
+    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
+  static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
+                "N must be a multiple of OUTPUT_COLS_PER_BLOCK");
+
+  constexpr bool has_static_k = (K >= 0);
+  if constexpr (has_static_k) {
+    static_assert(K % TILE_K == 0, "K must be divisible by TILE_K");
+  } else {
+    static_assert(phase == AttentionGemmPhase::PV,
+                  "Runtime K specialization only supported for PV.");
+  }
+
+  // Template / runtime K values forwarded to the meso-kernel.
+  constexpr int32_t k_template_arg = has_static_k ? K : -1;
+  const int32_t k_runtime_arg = has_static_k ? 0 : K_runtime;
+
+#define RUN_ROW_CHUNK(mb)                                          \
+  gemm_packA_compute_MB_xN<mb, N, k_template_arg, phase>(          \
+      a_rows, B, c_rows, k_runtime_arg, lda, ldc, b_layout_stride, \
+      b_reduction_stride, accumulate)
+
+  int32_t row = 0;
+  while (row < M) {
+    const int32_t rows_left = M - row;
+    const c10::BFloat16* a_rows = A + row * lda;
+    float* c_rows = C + row * ldc;
+
+    // Largest supported chunk that still fits in the remaining rows.
+    int32_t chunk = 1;
+    if (rows_left >= 8) {
+      chunk = 8;
+    } else if (rows_left >= 4) {
+      chunk = 4;
+    } else if (rows_left >= 2) {
+      chunk = 2;
+    }
+
+    switch (chunk) {
+      case 8:
+        RUN_ROW_CHUNK(8);
+        break;
+      case 4:
+        RUN_ROW_CHUNK(4);
+        break;
+      case 2:
+        RUN_ROW_CHUNK(2);
+        break;
+      default:
+        RUN_ROW_CHUNK(1);
+        break;
+    }
+    row += chunk;
+  }
+#undef RUN_ROW_CHUNK
+}
+
+#undef INIT_ACC_ROWPAIR_4
+#undef STORE_ACC_ROWPAIR_4
+#undef BFMMLA_COMPUTE_4
+
+}  // namespace
+
+// TileGemm adapter: exposes the BFMMLA macro-kernel through the tile-gemm
+// interface used by the attention template.
+
+template <typename kv_cache_t, int32_t BlockTokens, int32_t HeadDim>
+class TileGemmNEONBFMMLA {
+ public:
+  // Runs one tile GEMM for the requested attention phase.
+  // - QK: N = BlockTokens, K = HeadDim (compile-time reduction length).
+  // - PV: N = HeadDim, K = dynamic_k_size; the values 32, 128 and 256 use
+  //   K-specialized instantiations, everything else takes the runtime-K path.
+  // ldb is not read; block_size and dynamic_k_size are only read for PV.
+  template <AttentionGemmPhase phase, int32_t head_dim_ct>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                [[maybe_unused]] const int64_t ldb,
+                                const int64_t ldc,
+                                [[maybe_unused]] const int32_t block_size,
+                                [[maybe_unused]] const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    static_assert(BlockTokens % OUTPUT_COLS_PER_BLOCK == 0);
+    // The kernels are specialized on HeadDim; head_dim_ct exists only so the
+    // signature lines up with the other tile_gemm implementations.
+    if constexpr (head_dim_ct >= 0) {
+      static_assert(head_dim_ct == HeadDim,
+                    "BFMMLA expects head_dim_ct to match HeadDim; PV passes "
+                    "-1 for API parity.");
+    }
+
+    const auto* a_bf16 = reinterpret_cast<const c10::BFloat16*>(a_tile);
+
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      const int64_t k_chunk_stride = K_INNER_STRIDE;
+      const int64_t token_group_stride = (HeadDim / TILE_K) * K_INNER_STRIDE;
+
+      gemm_macro_neon_bfmmla<AttentionGemmPhase::QK, BlockTokens, HeadDim>(
+          a_bf16, b_tile, c_tile, m_size, 0, lda, ldc, token_group_stride,
+          k_chunk_stride, accum_c);
+    } else {
+      const int64_t v_pair_stride =
+          (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+
+      // Common reduction lengths get a compile-time K; the rest fall back to
+      // the runtime-K instantiation.
+      if (dynamic_k_size == 32) {
+        gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 32>(
+            a_bf16, b_tile, c_tile, m_size, 32, lda, ldc, v_pair_stride, 0,
+            accum_c);
+      } else if (dynamic_k_size == 128) {
+        gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 128>(
+            a_bf16, b_tile, c_tile, m_size, 128, lda, ldc, v_pair_stride, 0,
+            accum_c);
+      } else if (dynamic_k_size == 256) {
+        gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 256>(
+            a_bf16, b_tile, c_tile, m_size, 256, lda, ldc, v_pair_stride, 0,
+            accum_c);
+      } else {
+        gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim>(
+            a_bf16, b_tile, c_tile, m_size, dynamic_k_size, lda, ldc,
+            v_pair_stride, 0, accum_c);
+      }
+    }
+  }
+};
+
+// Shared ASIMD BFMMLA implementation (BF16 only). The block size alignment and
+// ISA tag are template parameters so we can reuse the same kernels for
+// different NEON configurations.
+//
+// The class bundles the type aliases and layout constants, the entry point
+// that instantiates the attention template with TileGemmNEONBFMMLA, the
+// cache stride helpers, the Q pre-scaling copy and the K/V cache writer.
+template <int64_t block_size_alignment, ISA isa_type, int64_t head_dim>
+class AttentionImplNEONBFMMLA {
+ public:
+  using query_t = c10::BFloat16;
+  using q_buffer_t = c10::BFloat16;
+  using kv_cache_t = c10::BFloat16;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = c10::BFloat16;
+
+  static constexpr int64_t BlockSizeAlignment = block_size_alignment;
+  // HeadDimAlignment equals head_dim so that the PV phase processes
+  // the full head dimension in a single gemm call.
+  static constexpr int64_t HeadDimAlignment = head_dim;
+  static constexpr int64_t MaxQHeadNumPerIteration = 16;
+  static constexpr int64_t HeadDim = head_dim;
+  static constexpr ISA ISAType = isa_type;
+  // The scale is folded into Q by copy_q_heads_tile() instead of being
+  // applied to the logits.
+  static constexpr bool scale_on_logits = false;
+
+  static_assert(HeadDim % OUTPUT_COLS_PER_BLOCK == 0);
+  static_assert(BlockSizeAlignment % OUTPUT_COLS_PER_BLOCK == 0);
+  static_assert(HeadDim % TILE_K == 0, "HeadDim must be a multiple of TILE_K");
+
+ public:
+  // Instantiates the supplied attention template with the BFMMLA tile gemm
+  // and invokes it with the forwarded parameter pack.
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<
+        TileGemmNEONBFMMLA<kv_cache_t, static_cast<int32_t>(BlockSizeAlignment),
+                           static_cast<int32_t>(HeadDim)>>
+        attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // Key cache stride per token group (TokenColumn layout; QK)
+  // Depends only on the template parameters; block_size is ignored.
+  static constexpr int64_t k_cache_token_group_stride(
+      [[maybe_unused]] const int32_t block_size) {
+    static_assert(BlockSizeAlignment % K_TOKENS_PER_GROUP == 0);
+    return (BlockSizeAlignment / K_TOKENS_PER_GROUP) *
+           ((head_dim / TILE_K) * K_INNER_STRIDE);
+  }
+
+  // Value cache stride per token group (TokenRow layout; PV)
+  // Depends only on the template parameters; block_size is ignored.
+  static constexpr int64_t v_cache_token_group_stride(
+      [[maybe_unused]] const int32_t block_size) {
+    static_assert(BlockSizeAlignment % V_TOKENS_PER_ROW_BLOCK == 0);
+    return (BlockSizeAlignment / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+  }
+
+  // Stride to the "next" head_dim group: the full per-head V cache size
+  // (head_dim * block_size). Because HeadDimAlignment == head_dim there is
+  // only one head_dim group, so callers are not expected to step by it.
+  static constexpr int64_t v_cache_head_group_stride(
+      const int32_t block_size) {
+    return head_dim * block_size;
+  }
+
+  // Copies Q heads into the contiguous q_buffer while multiplying every
+  // element by `scale`. Input and output are both BF16; the multiply is done
+  // in FP32, eight elements per step.
+  // - src:            query data, addressed as
+  //                   src + qi * q_num_stride + hi * q_head_stride.
+  // - q_buffer:       destination, laid out as [q_num][q_heads_per_kv][head_dim].
+  // - q_num:          number of query tokens to copy.
+  // - q_heads_per_kv: number of Q heads copied per query token.
+  // - scale:          factor applied to every element.
+  static void copy_q_heads_tile(c10::BFloat16* __restrict__ src,
+                                c10::BFloat16* __restrict__ q_buffer,
+                                const int32_t q_num,
+                                const int32_t q_heads_per_kv,
+                                const int64_t q_num_stride,
+                                const int64_t q_head_stride, float scale) {
+    constexpr int32_t dim = static_cast<int32_t>(head_dim);
+    const float32x4_t scale_vec = vdupq_n_f32(scale);
+
+    for (int32_t qi = 0; qi < q_num; ++qi) {
+      for (int32_t hi = 0; hi < q_heads_per_kv; ++hi) {
+        c10::BFloat16* __restrict__ curr_q =
+            src + qi * q_num_stride + hi * q_head_stride;
+        c10::BFloat16* __restrict__ dst =
+            q_buffer + qi * q_heads_per_kv * head_dim + hi * head_dim;
+
+        for (int32_t i = 0; i < dim; i += OUTPUT_COLS_PER_BLOCK) {
+          bfloat16x8_t in8 =
+              vld1q_bf16(reinterpret_cast<const bfloat16_t*>(curr_q + i));
+          // Widen each half to FP32, scale, then narrow back to BF16.
+          // (Named scaled_lo/scaled_hi so the head index `hi` above is not
+          // shadowed.)
+          float32x4_t scaled_lo =
+              vmulq_f32(vcvtq_low_f32_bf16(in8), scale_vec);
+          float32x4_t scaled_hi =
+              vmulq_f32(vcvtq_high_f32_bf16(in8), scale_vec);
+
+          bfloat16x4_t lo_b = vcvt_bf16_f32(scaled_lo);
+          bfloat16x4_t hi_b = vcvt_bf16_f32(scaled_hi);
+          bfloat16x8_t out = vcombine_bf16(lo_b, hi_b);
+          vst1q_bf16(reinterpret_cast<bfloat16_t*>(dst + i), out);
+        }
+      }
+    }
+  }
+
+ public:
+  // Reshape and cache K/V into BFMMLA-optimized layouts
+  // K cache:
+  // [block_size/K_TOKENS_PER_GROUP][head_dim/TILE_K][K_INNER_STRIDE]
+  // - TokenColumn
+  // V cache:
+  // [head_dim/TILE_COLS][block_size/V_TOKENS_PER_ROW_BLOCK][V_INNER_STRIDE]
+  // - TokenRows
+  //
+  // For every (head, token) pair the destination slot is taken from
+  // slot_mapping[token]; negative slots are skipped. The slot is split into
+  // block_idx = pos / block_size and block_offset = pos % block_size.
+  // num_blocks and block_size_stride are not read. Heads are distributed
+  // across OpenMP threads.
+  static void reshape_and_cache(
+      const c10::BFloat16* __restrict__ key,
+      const c10::BFloat16* __restrict__ value,
+      c10::BFloat16* __restrict__ key_cache,
+      c10::BFloat16* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride,
+      [[maybe_unused]] const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size,
+      [[maybe_unused]] const int64_t block_size_stride) {
+    const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
+    const int64_t v_pair_stride =
+        (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+
+#pragma omp parallel for
+    for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+      for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) continue;
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+
+        // Key cache: TokenColumn QK
+        {
+          const c10::BFloat16* __restrict key_src =
+              key + token_idx * key_token_num_stride +
+              head_idx * key_head_num_stride;
+
+          c10::BFloat16* __restrict key_base = key_cache +
+                                               block_idx * num_blocks_stride +
+                                               head_idx * cache_head_num_stride;
+
+          // Token group, token pair within the group, and which half of the
+          // pair (odd tokens land TILE_K elements after even ones).
+          const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
+          const int64_t pair_in_block =
+              (block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
+          const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;
+
+          c10::BFloat16* __restrict block_base =
+              key_base + block_in_block * k_block_stride;
+
+          // Copy the head vector in 4-element (64-bit) pieces, one per
+          // reduction chunk.
+          // NOTE(review): the per-pair offset uses V_INNER_STRIDE inside the
+          // K layout; presumably this equals TILE_COLS * TILE_K - confirm.
+          for (int64_t hd4 = 0; hd4 < head_dim / TILE_K; ++hd4) {
+            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(
+                block_base + hd4 * K_INNER_STRIDE +
+                pair_in_block * V_INNER_STRIDE + lane_base);
+            const uint16_t* src_u16 =
+                reinterpret_cast<const uint16_t*>(key_src + hd4 * TILE_K);
+            vst1_u16(dst_u16, vld1_u16(src_u16));
+          }
+        }
+
+        // Value cache: TokenRow PV
+        {
+          const c10::BFloat16* __restrict value_src =
+              value + token_idx * value_token_num_stride +
+              head_idx * value_head_num_stride;
+
+          c10::BFloat16* __restrict value_base =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride;
+
+          const int64_t row_block = block_offset / V_TOKENS_PER_ROW_BLOCK;
+          const int64_t lane = block_offset & (V_TOKENS_PER_ROW_BLOCK - 1);
+
+          c10::BFloat16* __restrict row_block_base =
+              value_base + row_block * V_INNER_STRIDE;
+
+          // For each pair of head-dim columns, the two values of this token
+          // are written V_TOKENS_PER_ROW_BLOCK elements apart.
+          for (int64_t hd2 = 0; hd2 < head_dim / TILE_COLS; ++hd2) {
+            c10::BFloat16* __restrict dst_val =
+                row_block_base + hd2 * v_pair_stride;
+
+            const uint16_t* src_u16 =
+                reinterpret_cast<const uint16_t*>(value_src);
+            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(dst_val);
+            dst_u16[lane] = src_u16[hd2 * TILE_COLS + 0];
+            dst_u16[lane + V_TOKENS_PER_ROW_BLOCK] =
+                src_u16[hd2 * TILE_COLS + 1];
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace cpu_attention
+
+#endif  // CPU_ATTN_ASIMD_BFMMLA_HPP
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
--- a/csrc/cpu/cpu_wna16.cpp
+++ b/csrc/cpu/cpu_wna16.cpp
@@ -116,7 +116,7 @@ class Dequantizer4b {
      scalar_vec_t output_vec_0(wb_0);
      scalar_vec_t output_vec_1(wb_1);

-      // AMX needs to interlave K elements to pack as 32 bits
+      // AMX needs to interleave K elements to pack as 32 bits
      if constexpr (isa == ISA::AMX) {
        vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
      } else {
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -14,13 +14,11 @@ struct KernelVecType<float> {
  using cvt_vec_type = vec_op::FP32Vec16;
 };

-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct KernelVecType<c10::BFloat16> {
  using load_vec_type = vec_op::BF16Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
 };
-#endif

 template <>
 struct KernelVecType<c10::Half> {
@@ -360,13 +358,14 @@ void onednn_scaled_mm(
    const std::optional<torch::Tensor>& azp,      // [M] or [1]
    const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
    const std::optional<torch::Tensor>& bias,     // [N]
-    int64_t handler) {
+    const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.is_contiguous());
  TORCH_CHECK(c.is_contiguous());
  W8A8MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
+          handler_tensor.item<int64_t>());
  const int32_t* azp_ptr = nullptr;
  if (azp.has_value()) {
    azp_ptr = azp->data_ptr<int32_t>();
@@ -519,13 +518,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,

 void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
               const torch::Tensor& a,  // [M, IC], row-major
-               const std::optional<torch::Tensor>& bias, int64_t handler) {
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
  TORCH_CHECK(c.stride(-1) == 1);
  MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());

 // ACL matmuls expect contiguous source tensors
 #ifdef VLLM_USE_ACL
--- a/csrc/cpu/generate_cpu_attn_dispatch.py
+++ b/csrc/cpu/generate_cpu_attn_dispatch.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Generate CPU attention dispatch switch cases and kernel instantiations.
+"""
+
+import os
+
+# Head dimensions divisible by 32 (support all ISAs)
+HEAD_DIMS_32 = [32, 64, 96, 128, 160, 192, 224, 256]
+
+# Head dimensions divisible by 16 but not 32 (VEC16 only)
+# generate_cases_for_isa_group() still emits one case per requested ISA for
+# these sizes, but every such case instantiates the VEC16 implementation.
+HEAD_DIMS_16 = [80, 112]
+
+# ISA types
+# Numeric tag OR-ed into the low 8 bits of the encoded dispatch key.
+# Presumably these must match the values of the C++ cpu_attention::ISA enum,
+# since the generated helper encodes static_cast<int64_t>(isa) - verify
+# against the enum declaration.
+ISA_TYPES = {
+    "AMX": 0,
+    "VEC": 1,
+    "VEC16": 2,
+    "NEON": 3,
+}
+
+# ISAs supported for head_dims divisible by 32
+ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16"]
+
+# ISAs supported for head_dims divisible by 16 only
+# NOTE(review): not referenced anywhere in this script; the HEAD_DIMS_16 loop
+# hard-codes VEC16 instead.
+ISA_FOR_16 = ["VEC16"]
+
+
+def encode_params(head_dim: int, isa_type: str) -> int:
+    """Pack a head dimension and an ISA name into one integer key.
+
+    The ISA's numeric tag (looked up in ISA_TYPES) occupies the low 8 bits
+    and head_dim sits above it, i.e. ``(head_dim << 8) | tag``. For an
+    int64_t key this leaves room for 256 ISA tags and head_dim values up to
+    2^56 - 1.
+    """
+    shifted_dim = head_dim << 8
+    return shifted_dim | ISA_TYPES[isa_type]
+
+
+def generate_cases_for_isa_group(isa_list: list[str]) -> str:
+    """Render the ``case`` bodies of the dispatch switch for one ISA set.
+
+    Cases for HEAD_DIMS_32 come first (only for ISAs listed in ISA_FOR_32),
+    followed by cases for HEAD_DIMS_16. The latter are emitted for every ISA
+    in ``isa_list`` but always instantiate the VEC16 implementation. Cases
+    are joined with newlines; each line ends in a macro continuation.
+    """
+
+    def render_case(head_dim: int, isa: str, impl_isa: str, note: str) -> str:
+        # `isa` determines the case value and the comment; `impl_isa` is the
+        # ISA tag of the AttentionImpl that the case instantiates.
+        encoded = encode_params(head_dim, isa)
+        return (
+            f"""      case {encoded}LL: {{ """
+            f"""/* head_dim={head_dim}, isa={isa}{note} */ \\"""
+            f"""
+        constexpr size_t head_dim = {head_dim}; \\"""
+            f"""
+        using attn_impl = cpu_attention::AttentionImpl<"""
+            f"""cpu_attention::ISA::{impl_isa}, \\"""
+            f"""
+                                                       """
+            f"""scalar_t, head_dim>; \\"""
+            f"""
+        return __VA_ARGS__(); \\"""
+            f"""
+      }} \\"""
+        )
+
+    # Head dims divisible by 32: each ISA gets its own implementation.
+    cases = [
+        render_case(head_dim, isa, isa, "")
+        for head_dim in HEAD_DIMS_32
+        for isa in isa_list
+        if isa in ISA_FOR_32
+    ]
+
+    # Head dims divisible by 16 only: every ISA is routed to VEC16.
+    cases.extend(
+        render_case(head_dim, isa, "VEC16", " (using VEC16)")
+        for head_dim in HEAD_DIMS_16
+        for isa in isa_list
+    )
+
+    return "\n".join(cases)
+
+
+def generate_helper_function() -> str:
+    """Return the C++ source of the inline key-encoding helper.
+
+    The emitted function mirrors encode_params(): ``(head_dim << 8) | isa``.
+    The text starts with an empty line and ends with a newline.
+    """
+    source_lines = [
+        "",
+        "inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {",
+        "  return (head_dim << 8) | static_cast<int64_t>(isa);",
+        "}",
+        "",
+    ]
+    return "\n".join(source_lines)
+
+
+def generate_header_file() -> str:
+    """Assemble the complete text of the generated dispatch header.
+
+    Layout: include preamble, the encoding helper, then one
+    CPU_ATTN_DISPATCH macro definition per preprocessor branch
+    (AMX build, aarch64 build, generic fallback), and finally the closing
+    ``#endif`` lines. Every branch shares the same macro opening and the
+    same ``default:`` error case; only the generated cases differ.
+    """
+    preamble = """// auto generated by generate_cpu_attn_dispatch.py
+// clang-format off
+
+#ifndef CPU_ATTN_DISPATCH_GENERATED_H
+#define CPU_ATTN_DISPATCH_GENERATED_H
+
+#include "cpu_attn_vec.hpp"
+#include "cpu_attn_vec16.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu_attn_amx.hpp"
+#endif
+
+#ifdef __aarch64__
+  #include "cpu_attn_neon.hpp"
+#endif
+
+"""
+
+    # Opening of the macro up to (and including) the switch statement.
+    macro_open = """#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
+  [&] { \\
+    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
+    switch (encoded_params) { \\
+"""
+
+    # Error case plus the closing braces of the switch and the lambda.
+    macro_close = """
+      default: { \\
+        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
+                    std::to_string(HEAD_DIM) + " isa=" + \\
+                    std::to_string(static_cast<int>(ISA_TYPE))); \\
+      } \\
+    } \\
+  }()
+
+"""
+
+    # (preprocessor guard line, ISAs dispatched under that guard)
+    branches = [
+        # x86_64 with AMX
+        ("#if defined(CPU_CAPABILITY_AMXBF16)\n", ["AMX", "VEC", "VEC16"]),
+        # ARM64 with NEON
+        ("#elif defined(__aarch64__)\n", ["NEON", "VEC", "VEC16"]),
+        # Fallback: VEC and VEC16 only
+        ("#else\n", ["VEC", "VEC16"]),
+    ]
+
+    pieces = [
+        preamble,
+        generate_helper_function(),
+        "\n// Dispatch macro using encoded parameters\n",
+    ]
+    for guard_line, isa_group in branches:
+        pieces.append(guard_line)
+        pieces.append(macro_open)
+        pieces.append(generate_cases_for_isa_group(isa_group))
+        pieces.append(macro_close)
+
+    pieces.append(
+        """#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ */
+
+#endif  // CPU_ATTN_DISPATCH_GENERATED_H
+"""
+    )
+
+    return "".join(pieces)
+
+
+def main():
+    """Write the generated header next to this script.
+
+    The output file is cpu_attn_dispatch_generated.h in the directory that
+    contains this script; an existing file is overwritten.
+    """
+    script_dir = os.path.dirname(__file__)
+    target = os.path.join(script_dir, "cpu_attn_dispatch_generated.h")
+
+    with open(target, "w") as header_file:
+        header_file.write(generate_header_file())
+
+
+if __name__ == "__main__":
+    main()
--- a/csrc/cpu/mla_decode.cpp
+++ b/csrc/cpu/mla_decode.cpp
@@ -38,9 +38,16 @@ struct KernelVecType<c10::BFloat16> {
  using qk_vec_type = vec_op::BF16Vec32;
  using v_load_vec_type = vec_op::BF16Vec16;
 };
-#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
-// pass
-#else
+
+#elif defined(__s390x__)
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using qk_load_vec_type = vec_op::BF16Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
+
+#elif defined(__aarch64__)
 template <>
 struct KernelVecType<c10::BFloat16> {
  using qk_load_vec_type = vec_op::BF16Vec16;
--- a/csrc/cpu/sgl-kernels/gemm.cpp
+++ b/csrc/cpu/sgl-kernels/gemm.cpp
@@ -265,7 +265,7 @@ void tinygemm_kernel(
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
        case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/gemm_fp8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_fp8.cpp
@@ -324,7 +324,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/gemm_int8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp
@@ -180,7 +180,7 @@ void tinygemm_kernel(
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
        case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -398,7 +398,7 @@ void tinygemm_kernel(
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
@@ -511,7 +511,7 @@ void tinygemm_kernel(
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break;
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/moe_int8.cpp
+++ b/csrc/cpu/sgl-kernels/moe_int8.cpp
@@ -271,7 +271,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
@@ -401,7 +401,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -237,10 +237,10 @@ struct ThreadSHMContext {
 class SHMManager {
 public:
  explicit SHMManager(const std::string& name, const int rank,
-                      const int group_size)
+                      const int group_size, const int thread_num)
      : _rank(rank),
        _group_size(group_size),
-        _thread_num(omp_get_max_threads()),
+        _thread_num(thread_num),
        _shm_names({""}),
        _shared_mem_ptrs({nullptr}),
        _shm_ctx(nullptr) {
@@ -282,11 +282,11 @@ class SHMManager {
  }

  static int64_t create_singleton_instance(const std::string& name,
-                                           const int group_size,
-                                           const int rank) {
+                                           const int group_size, const int rank,
+                                           const int thread_num) {
    std::lock_guard<std::mutex> guard(SingletonInstancesLock);
    SingletonInstances.emplace_back(
-        std::make_unique<SHMManager>(name, rank, group_size));
+        std::make_unique<SHMManager>(name, rank, group_size, thread_num));
    return static_cast<int64_t>(SingletonInstances.size() - 1);
  }

@@ -854,8 +854,9 @@ std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
 }

 int64_t init_shm_manager(const std::string& name, const int64_t group_size,
-                         const int64_t rank) {
-  return SHMManager::create_singleton_instance(name, group_size, rank);
+                         const int64_t rank, const int64_t thread_num) {
+  return SHMManager::create_singleton_instance(name, group_size, rank,
+                                               thread_num);
 }

 std::string join_shm_manager(int64_t handle, const std::string& name) {
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                      const std::optional<torch::Tensor>& azp,
                      const std::optional<torch::Tensor>& azp_adj,
                      const std::optional<torch::Tensor>& bias,
-                      int64_t handler);
+                      const torch::Tensor& handler_tensor);

 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                 int64_t primitive_cache_size);

 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
-               const std::optional<torch::Tensor>& bias, int64_t handler);
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor);

 bool is_onednn_acl_supported();

@@ -34,7 +35,7 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);

 int64_t init_shm_manager(const std::string& name, const int64_t group_size,
-                         const int64_t rank);
+                         const int64_t rank, const int64_t thread_num);

 std::string join_shm_manager(int64_t handle, const std::string& name);

@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN GEMM
  ops.def(
      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
-      "int handler) -> ()");
+      "Tensor handler_tensor) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

  // Check if oneDNN was built with ACL backend
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
  ops.def(
      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
-      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
+      "Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);

  // Compute int8 quantized tensor for given scaling factor.
@@ -231,8 +232,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

 // SHM CCL
 #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
-  ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
-          &init_shm_manager);
+  ops.def(
+      "init_shm_manager(str name, int group_size, int rank, int thread_num) -> "
+      "int",
+      &init_shm_manager);
  ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
  ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
  ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
@@ -291,7 +294,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
      "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
      "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
-      "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
+      "float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()",
      &cpu_attention_with_kv_cache);

  // placeholders
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -30,12 +30,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
 };

-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
 };
-#endif

 #if !defined(__powerpc__)
 template <>
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  if (flag) {  // support GPUDirect RDMA if possible
    prop.allocFlags.gpuDirectRDMACapable = 1;
  }
+  int fab_flag = 0;
+  CUDA_CHECK(cuDeviceGetAttribute(
+      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+  if (fab_flag) {  // support fabric handle if possible
+    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+  }
 #endif

 #ifndef USE_ROCM
  // Allocate memory using cuMemCreate
-  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+  CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
+  if (ret) {
+    if (fab_flag &&
+        (ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
+      // Fabric allocation may fail without multi-node nvlink,
+      // fallback to POSIX file descriptor
+      prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+      CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+    } else {
+      CUDA_CHECK(ret);
+    }
+  }
  if (error_code != 0) {
    return;
  }
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -3,7 +3,8 @@
 #include "cutlass/cutlass.h"
 #include <climits>
 #include "cuda_runtime.h"
-#include <iostream>
+#include <cstdio>
+#include <cstdlib>

 /**
 * Helper function for checking CUTLASS errors
@@ -31,12 +32,63 @@ int32_t get_sm_version_num();
 * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
 * into code that will be executed on the device where it is defined.
 */
+
+template <typename Kernel>
+struct enable_sm75_to_sm80 : Kernel {  // arch guard: runs Kernel::invoke only when 750 <= __CUDA_ARCH__ < 800
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // only defined in device compilation passes; otherwise the body is empty
+  #if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm[75, 80).\n");  // device-side printf, built for an unsupported arch
+    asm("trap;");  // PTX trap: stop the kernel instead of silently doing nothing
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {  // arch guard: runs Kernel::invoke only when 800 <= __CUDA_ARCH__ < 890
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // only defined in device compilation passes; otherwise the body is empty
+  #if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm[80, 89).\n");  // device-side printf, built for an unsupported arch
+    asm("trap;");  // PTX trap: stop the kernel instead of silently doing nothing
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm89_to_sm90 : Kernel {  // arch guard: runs Kernel::invoke only when 890 <= __CUDA_ARCH__ < 900
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // only defined in device compilation passes; otherwise the body is empty
+  #if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm[89, 90).\n");  // device-side printf, built for an unsupported arch
+    asm("trap;");  // PTX trap: stop the kernel instead of silently doing nothing
+  #endif
+#endif
+  }
+};
+
 template <typename Kernel>
 struct enable_sm90_or_later : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ >= 900
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm >= 90.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };
@@ -45,18 +97,43 @@ template <typename Kernel>
 struct enable_sm90_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 900
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm90.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };

 template <typename Kernel>
-struct enable_sm100_only : Kernel {
+struct enable_sm100f_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm100f.\n");
+    asm("trap;");
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm100a_only : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1000
+    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm100a.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };
@@ -65,7 +142,23 @@ template <typename Kernel>
 struct enable_sm120_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1200
+    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm120.\n");
+    asm("trap;");
+  #endif
+#endif
+  }
+};
+
+// SM12x family includes SM120 (RTX 5090) and SM121 (DGX Spark GB10)
+template <typename Kernel>
+struct enable_sm120_family : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
    Kernel::operator()(std::forward<Args>(args)...);
 #endif
  }
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -770,7 +770,7 @@ torch::Tensor moe_wna16_marlin_gemm(
    b_bias = b_bias_or_none.value();
    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
-    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(1) != size_n");
    TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
  } else {
    b_bias = torch::empty({0}, options);
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -14,12 +14,10 @@ void moe_permute(
    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
-    const std::optional<int64_t>& align_block_size,
    torch::Tensor& permuted_input,             // [permuted_size, hidden]
    torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
-    torch::Tensor& permuted_idx,               // [permute_size]
-    torch::Tensor& m_indices) {                // [align_expand_m]
+    torch::Tensor& permuted_idx) {             // [permute_size]
  TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
@@ -34,8 +32,6 @@ void moe_permute(
              "token_expert_indices shape must be same as inv_permuted_idx");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
-  auto align_block_size_value =
-      align_block_size.has_value() ? align_block_size.value() : -1;
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const long sorter_size =
      CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert);
@@ -80,20 +76,8 @@ void moe_permute(
        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
        get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
-        n_hidden, topk, n_local_expert, align_block_size_value, stream);
+        n_hidden, topk, n_local_expert, stream);
  });
-
-  // get m_indices and update expert_first_token_offset with align block
-  // this is only required for DeepGemm and not required for CUTLASS group gemm
-  if (align_block_size.has_value()) {
-    auto align_expert_first_token_offset =
-        torch::zeros_like(expert_first_token_offset);
-    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
-                get_ptr<int64_t>(align_expert_first_token_offset),
-                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
-                stream);
-    expert_first_token_offset.copy_(align_expert_first_token_offset);
-  }
 }

 void moe_unpermute(
@@ -186,16 +170,13 @@ void shuffle_rows(const torch::Tensor& input_tensor,

 #else

-void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
-                 torch::Tensor& topk_ids,
+void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_ids,
                 const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                 const std::optional<int64_t>& align_block_size,
                 torch::Tensor& permuted_input,
                 torch::Tensor& expert_first_token_offset,
-                 torch::Tensor& src_row_id2dst_row_id_map,
-                 torch::Tensor& m_indices) {
+                 torch::Tensor& inv_permuted_idx, torch::Tensor& permuted_idx) {
  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
 }

--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -168,64 +168,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
      topk_id_ptr, size, expert_map_ptr, num_experts);
 }

-template <bool ALIGN_BLOCK_SIZE>
-__global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
-                                  int64_t* align_expert_first_token_offset,
-                                  int* m_indices, const int num_local_expert,
-                                  const int align_block_size) {
-  int eidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  extern __shared__ int64_t smem_expert_first_token_offset[];
-  for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
-  }
-  __syncthreads();
-  auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
-  auto first_token_offset = smem_expert_first_token_offset[eidx];
-  int n_token_in_expert = last_token_offset - first_token_offset;
-
-  if constexpr (ALIGN_BLOCK_SIZE) {
-    n_token_in_expert = (n_token_in_expert + align_block_size - 1) /
-                        align_block_size * align_block_size;
-    // round up to ALIGN_BLOCK_SIZE
-    int64_t accumulate_align_offset = 0;
-    for (int i = 1; i <= eidx + 1; i++) {
-      int n_token = smem_expert_first_token_offset[i] -
-                    smem_expert_first_token_offset[i - 1];
-      accumulate_align_offset =
-          accumulate_align_offset + (n_token + align_block_size - 1) /
-                                        align_block_size * align_block_size;
-      if (i == eidx) {
-        first_token_offset = accumulate_align_offset;
-      }
-      // last block store align_expert_first_token_offset
-      if (eidx == num_local_expert - 1 && threadIdx.x == 0) {
-        align_expert_first_token_offset[i] = accumulate_align_offset;
-      }
-    }
-  }
-  for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) {
-    // update m_indice with expert id
-    m_indices[first_token_offset + idx] = eidx;
-  }
-}
-
-void getMIndices(int64_t* expert_first_token_offset,
-                 int64_t* align_expert_first_token_offset, int* m_indices,
-                 int num_local_expert, const int align_block_size,
-                 cudaStream_t stream) {
-  int block = 256;
-  int grid = num_local_expert;
-  int smem_size = sizeof(int64_t) * (num_local_expert + 1);
-  if (align_block_size == -1) {
-    getMIndicesKernel<false><<<grid, block, smem_size, stream>>>(
-        expert_first_token_offset, align_expert_first_token_offset, m_indices,
-        num_local_expert, align_block_size);
-  } else {
-    getMIndicesKernel<true><<<grid, block, smem_size, stream>>>(
-        expert_first_token_offset, align_expert_first_token_offset, m_indices,
-        num_local_expert, align_block_size);
-  }
-}
-
 #endif
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -60,9 +60,9 @@ void expandInputRowsKernelLauncher(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t* expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
-    int num_local_experts, const int& align_block_size, cudaStream_t stream);
+    int num_local_experts, cudaStream_t stream);

 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
@@ -75,9 +75,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                              const int* expert_map_ptr, int num_experts,
                              cudaStream_t stream);

-void getMIndices(int64_t* expert_first_token_offset,
-                 int64_t* align_expert_first_token_offset, int* m_indices,
-                 int num_local_expert, const int align_block_size,
-                 cudaStream_t stream);
-
 #include "moe_permute_unpermute_kernel.inl"
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@@ -1,13 +1,13 @@
 #pragma once

-template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
+template <typename T, bool CHECK_SKIPPED>
 __global__ void expandInputRowsKernel(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t* expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_dest_rows, int64_t const cols, int64_t k,
-    int num_local_experts, int align_block_size) {
+    int num_local_experts) {
  // Reverse permutation map.
  // I do this so that later, we can use the source -> dest map to do the k-way
  // reduction and unpermuting. I need the reverse map for that reduction to
@@ -18,37 +18,6 @@ __global__ void expandInputRowsKernel(
      expanded_dest_row_to_expanded_source_row[expanded_dest_row];
  int expert_id = sorted_experts[expanded_dest_row];

-  extern __shared__ int64_t smem_expert_first_token_offset[];
-  if constexpr (ALIGN_BLOCK_SIZE) {
-    // load g2s
-    for (int idx = threadIdx.x; idx < num_local_experts + 1;
-         idx += blockDim.x) {
-      smem_expert_first_token_offset[idx] =
-          __ldg(expert_first_token_offset + idx);
-    }
-    __syncthreads();
-    int lane_idx = threadIdx.x & 31;
-
-    if (lane_idx == 0) {
-      // set token_offset_in_expert = 0 if this expert is not local expert
-      int token_offset_in_expert =
-          expert_id >= num_local_experts
-              ? 0
-              : expanded_dest_row - smem_expert_first_token_offset[expert_id];
-      int64_t accumulate_align_offset = 0;
-#pragma unroll 1
-      for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) {
-        auto n_token_in_expert = smem_expert_first_token_offset[eidx] -
-                                 smem_expert_first_token_offset[eidx - 1];
-        accumulate_align_offset += (n_token_in_expert + align_block_size - 1) /
-                                   align_block_size * align_block_size;
-      }
-      expanded_dest_row = accumulate_align_offset + token_offset_in_expert;
-    }
-    // lane0 shuffle broadcast align_expanded_dest_row
-    expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0);
-  }
-
  if (threadIdx.x == 0) {
    assert(expanded_dest_row <= INT32_MAX);
    expanded_source_row_to_expanded_dest_row[expanded_source_row] =
@@ -88,30 +57,25 @@ void expandInputRowsKernelLauncher(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t* expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
-    int num_local_experts, const int& align_block_size, cudaStream_t stream) {
+    int num_local_experts, cudaStream_t stream) {
  int64_t const blocks = num_rows * k;
  int64_t const threads = 256;
-  using FuncPtr = decltype(&expandInputRowsKernel<T, true, true>);
-  FuncPtr func_map[2][2] = {
-      {&expandInputRowsKernel<T, false, false>,
-       &expandInputRowsKernel<T, false, true>},
-      {&expandInputRowsKernel<T, true, false>,
-       &expandInputRowsKernel<T, true, true>},
+  using FuncPtr = decltype(&expandInputRowsKernel<T, true>);
+  FuncPtr func_map[2] = {
+      &expandInputRowsKernel<T, false>,
+      &expandInputRowsKernel<T, true>,
  };
  bool is_check_skip = num_valid_tokens_ptr != nullptr;
-  bool is_align_block_size = align_block_size != -1;
-  auto func = func_map[is_check_skip][is_align_block_size];
+  auto func = func_map[is_check_skip];

-  int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
-
-  func<<<blocks, threads, smem_size, stream>>>(
+  func<<<blocks, threads, 0, stream>>>(
      unpermuted_input, permuted_output, sorted_experts,
      expanded_dest_row_to_expanded_source_row,
      expanded_source_row_to_expanded_dest_row, permuted_idx,
      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
-      num_local_experts, align_block_size);
+      num_local_experts);
 }

 template <class T, class U>
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -99,9 +99,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "moe_permute(Tensor input, Tensor topk_ids,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
-      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
+      "int topk, Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
-      "permuted_idx, Tensor! m_indices)->()");
+      "permuted_idx)->()");

  m.def(
      "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
--- a/Show More
+++ b/Show More