Compare commits


11 Commits

Author SHA1 Message Date
Shengqi Chen
b17039bccc [CI] Implement uploading to PyPI and GitHub in the release pipeline, enable release image building for CUDA 13.0 (#31032)
(cherry picked from commit 8e61425ee6)
2026-01-16 21:04:48 -08:00
Cyrus Leung
48b67ba75f [Frontend] Standardize use of create_error_response (#32319)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-16 11:35:10 +00:00
TJian
09f4264a55 [Bugfix] Fix ROCm dockerfiles (#32447)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-01-16 10:50:00 +08:00
Matthew Bonanni
7f42dc20bb [CI] Fix LM Eval Large Models (H100) (#32423)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
(cherry picked from commit bcf2333cd6)
2026-01-15 18:00:21 -08:00
TJian
c2a37a3cf8 Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-01-15 17:59:58 -08:00
Michael Goin
0e31fc7996 [UX] Use kv_offloading_backend=native by default (#32421)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit 1be5a73571)
2026-01-15 17:55:20 -08:00
Pleaplusone
6ac0fcf416 [ROCm][Bugfix] Disable hip sampler to fix deepseek's accuracy issue on ROCm (#32413)
Signed-off-by: ganyi <ygan@amd.com>
(cherry picked from commit 77c16df31d)
2026-01-15 17:55:06 -08:00
Douglas Lehr
b62249725c [ROCM] Add ROCm image build to release pipeline (#31995)
Signed-off-by: Doug Lehr <douglehr@amd.com>
Co-authored-by: Doug Lehr <douglehr@amd.com>
(cherry picked from commit c5891b5430)
2026-01-15 17:54:47 -08:00
vllmellm
1b57275207 [Bugfix][ROCm][performance] Resolve the performance regression issue of the Qwen3-Next-80B-A3B-Thinking under rocm_atten (#32336)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
(cherry picked from commit e27078ea80)
2026-01-15 17:54:01 -08:00
Martin Hickey
2c24bc6996 [BugFix] [KVConnector] Fix KV events for LMCache connector (#32169)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-13 10:56:23 -08:00
Cyrus Leung
0aa8c40552 [Bugfix] Replace PoolingParams.normalize with use_activation (#32243)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-13 10:56:23 -08:00
1268 changed files with 35452 additions and 71986 deletions

View File

@@ -1,8 +1,7 @@
 name: vllm_ci
 job_dirs:
-- ".buildkite/image_build"
 - ".buildkite/test_areas"
-- ".buildkite/hardware_tests"
+- ".buildkite/image_build"
 run_all_patterns:
 - "docker/Dockerfile"
 - "CMakeLists.txt"

View File

@@ -1,29 +0,0 @@
group: Hardware
steps:
- label: "AMD: :docker: build image"
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1

View File

@@ -1,8 +0,0 @@
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Ascend NPU Test"
soft_fail: true
timeout_in_minutes: 20
no_plugin: true
device: ascend_npu
commands:
- bash .buildkite/scripts/hardware_ci/run-npu-test.sh

View File

@@ -1,10 +0,0 @@
group: Hardware
steps:
- label: "GH200 Test"
soft_fail: true
device: gh200
no_plugin: true
optional: true
commands:
- nvidia-smi
- bash .buildkite/scripts/hardware_ci/run-gh200-test.sh

View File

@@ -1,24 +0,0 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
- label: "Intel GPU Test"
depends_on: []
soft_fail: true
device: intel_gpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-xpu-test.sh

View File

@@ -1,256 +1,56 @@
#!/bin/bash #!/bin/bash
set -euo pipefail set -e
# replace invalid characters in Docker image tags and truncate to 128 chars if [[ $# -lt 8 ]]; then
clean_docker_tag() { echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
local input="$1" exit 1
echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
print_usage_and_exit() {
echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
exit 1
}
print_instance_info() {
echo ""
echo "=== Debug: Instance Information ==="
# Get IMDSv2 token
if TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
AMI_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
AZ=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
echo "AMI ID: ${AMI_ID}"
echo "Instance Type: ${INSTANCE_TYPE}"
echo "Instance ID: ${INSTANCE_ID}"
echo "AZ: ${AZ}"
else
echo "Not running on EC2 or IMDS not available"
fi
# Check for warm cache AMI (marker file baked into custom AMI)
if [[ -f /etc/vllm-ami-info ]]; then
echo "Cache: warm (custom vLLM AMI)"
cat /etc/vllm-ami-info
else
echo "Cache: cold (standard AMI)"
fi
echo "==================================="
echo ""
}
setup_buildx_builder() {
echo "--- :buildkite: Setting up buildx builder"
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
# Custom AMI with standalone buildkitd - use remote driver for warm cache
echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
echo "Using remote driver to connect to buildkitd (warm cache available)"
if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
echo "Using existing baked-vllm-builder"
docker buildx use baked-vllm-builder
else
echo "Creating baked-vllm-builder with remote driver"
docker buildx create \
--name baked-vllm-builder \
--driver remote \
--use \
"unix://${BUILDKIT_SOCKET}"
fi
docker buildx inspect --bootstrap
elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
# Existing builder available
echo "Using existing builder: ${BUILDER_NAME}"
docker buildx use "${BUILDER_NAME}"
docker buildx inspect --bootstrap
else
# No local buildkitd, no existing builder - create new docker-container builder
echo "No local buildkitd found, using docker-container driver"
docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
docker buildx inspect --bootstrap
fi
# builder info
echo "Active builder:"
docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
}
check_and_skip_if_image_exists() {
if [[ -n "${IMAGE_TAG:-}" ]]; then
echo "--- :mag: Checking if image exists"
if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
echo "Image already exists: ${IMAGE_TAG}"
echo "Skipping build"
exit 0
fi
echo "Image not found, proceeding with build"
fi
}
ecr_login() {
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
}
prepare_cache_tags() {
# resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
cache="${MAIN_CACHE_ECR}:latest"
else
clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
cache="${TEST_CACHE_ECR}:${clean_branch}"
fi
CACHE_TO="$cache"
CACHE_FROM="$cache"
CACHE_FROM_BASE_BRANCH="$cache"
else
CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
else
clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
fi
fi
CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
}
resolve_parent_commit() {
if [[ -z "${PARENT_COMMIT:-}" ]]; then
PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
if [[ -n "${PARENT_COMMIT}" ]]; then
echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
export PARENT_COMMIT
else
echo "Could not determine parent commit (may be first commit in repo)"
fi
else
echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi
}
print_bake_config() {
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
echo "Saved bake config to ${BAKE_CONFIG_FILE}"
echo "--- :arrow_down: Uploading bake config to Buildkite"
buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
}
#################################
# Main Script #
#################################
print_instance_info
if [[ $# -lt 7 ]]; then
print_usage_and_exit
fi fi
# input args
REGISTRY=$1 REGISTRY=$1
REPO=$2 REPO=$2
BUILDKITE_COMMIT=$3 BUILDKITE_COMMIT=$3
BRANCH=$4 BRANCH=$4
VLLM_USE_PRECOMPILED=$5 VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6 VLLM_MERGE_BASE_COMMIT=$6
IMAGE_TAG=$7 CACHE_FROM=$7
IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional CACHE_TO=$8
# build config # authenticate with AWS ECR
TARGET="test-ci" aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}" aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
prepare_cache_tags # docker buildx
ecr_login docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls
# Environment info (for docs and human readers) # skip build if image already exists
# VLLM_CI_BRANCH - ci-infra branch to use (default: main) if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl) echo "Image not found, proceeding with build..."
# BUILDER_NAME - Name for buildx builder (default: vllm-builder) else
# echo "Image found"
# Build configuration (exported as environment variables for bake): exit 0
export BUILDKITE_COMMIT
export PARENT_COMMIT
export IMAGE_TAG
export IMAGE_TAG_LATEST
export CACHE_FROM
export CACHE_FROM_BASE_BRANCH
export CACHE_FROM_MAIN
export CACHE_TO
export VLLM_USE_PRECOMPILED
export VLLM_MERGE_BASE_COMMIT
# print args
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
# print build configuration
echo "--- :mag: Build configuration"
echo "TARGET: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "BUILDER_NAME: ${BUILDER_NAME}"
echo "CI_HCL_URL: ${CI_HCL_URL}"
echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
echo "--- :mag: Cache tags"
echo "CACHE_TO: ${CACHE_TO}"
echo "CACHE_FROM: ${CACHE_FROM}"
echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
echo "CI HCL path: ${CI_HCL_PATH}"
if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then
echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}"
echo "Make sure you're running from the vLLM repository root"
exit 1
fi fi
echo "--- :arrow_down: Downloading ci.hcl" if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}" merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
echo "Downloaded to ${CI_HCL_PATH}" else
merge_base_commit_build_args=""
if [[ ! -f "${CI_HCL_PATH}" ]]; then
echo "Error: ci.hcl not found at ${CI_HCL_PATH}"
exit 1
fi fi
setup_buildx_builder # build
docker buildx build --file docker/Dockerfile \
resolve_parent_commit --build-arg max_jobs=16 \
export PARENT_COMMIT --build-arg buildkite_commit=$BUILDKITE_COMMIT \
--build-arg USE_SCCACHE=1 \
print_bake_config --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
--build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
echo "--- :docker: Building ${TARGET}" --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" ${merge_base_commit_build_args} \
--cache-from type=registry,ref=${CACHE_FROM},mode=max \
echo "--- :white_check_mark: Build complete" --cache-to type=registry,ref=${CACHE_TO},mode=max \
--tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
$( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
--push \
--target test \
--progress plain .
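
Both the old and the new build script above skip the build when the commit-tagged image is already in the registry (check_and_skip_if_image_exists in the old script, the docker manifest inspect guard in the new one). A minimal standalone sketch of that guard, with a placeholder image name:

#!/bin/bash
# Skip the expensive build when the commit-tagged image is already published.
IMAGE="public.ecr.aws/example/vllm-ci:abc1234"   # placeholder tag
if docker manifest inspect "$IMAGE" >/dev/null 2>&1; then
  echo "Image already exists: $IMAGE, skipping build"
  exit 0
fi
echo "Image not found, proceeding with build"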
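
The clean_docker_tag helper in the longer script exists because Docker image tags may only contain letters, digits, underscores, periods, and dashes, and are capped at 128 characters, so branch names such as feature/foo have to be normalized before they become cache tags. A usage sketch (the input string is made up):

#!/bin/bash
# Same helper as in the script above; replaces invalid tag characters and truncates to 128 chars.
clean_docker_tag() {
  local input="$1"
  echo "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
}
clean_docker_tag "feature/rocm wheels#1"   # prints: feature_rocm_wheels_1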

View File

@@ -4,8 +4,7 @@ steps:
 key: image-build
 depends_on: []
 commands:
-- if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
-- if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
+- .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
 retry:
 automatic:
 - exit_status: -1 # Agent was lost
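
The new invocation forwards CACHE_FROM and CACHE_TO into BuildKit's registry cache. A hedged sketch of the shape those arguments take inside image_build.sh; the ECR path and PR number below are illustrative:

#!/bin/bash
# Illustrative only: the --cache-from/--cache-to shape used by the build script.
CACHE_FROM="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache:pr-12345"
CACHE_TO="${CACHE_FROM}"
docker buildx build \
  --file docker/Dockerfile \
  --cache-from "type=registry,ref=${CACHE_FROM}" \
  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
  --tag "example/vllm-ci:local" \
  --target test \
  .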

View File

@@ -1,5 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml

View File

@@ -393,7 +393,7 @@ if __name__ == "__main__":
 with open(results_folder / md_file, "w") as f:
 results = read_markdown(
 "../.buildkite/performance-benchmarks/"
-"performance-benchmarks-descriptions.md"
++ "performance-benchmarks-descriptions.md"
 )
 results = results.format(
 latency_tests_markdown_table=latency_md_table,

View File

@@ -25,9 +25,9 @@ check_gpus() {
 echo "Need at least 1 GPU to run benchmarking."
 exit 1
 fi
 declare -g arch_suffix=''
 if command -v nvidia-smi; then
 declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
 elif command -v amd-smi; then
@@ -181,20 +181,19 @@ upload_to_buildkite() {
 $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
-run_benchmark_tests() {
-# run benchmark tests using `vllm bench <test_type>` command
-# $1: test type (latency or throughput)
-# $2: a json file specifying test cases
-local test_type=$1
-local test_file=$2
-# Iterate over tests
-jq -c '.[]' "$test_file" | while read -r params; do
+run_latency_tests() {
+# run latency tests using `vllm bench latency` command
+# $1: a json file specifying latency test cases
+local latency_test_file
+latency_test_file=$1
+# Iterate over latency tests
+jq -c '.[]' "$latency_test_file" | while read -r params; do
 # get the test name, and append the GPU type back to it.
 test_name=$(echo "$params" | jq -r '.test_name')
-if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
-echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
+if [[ ! "$test_name" =~ ^latency_ ]]; then
+echo "In latency-test.json, test_name must start with \"latency_\"."
 exit 1
 fi
@@ -205,15 +204,15 @@ run_benchmark_tests() {
 fi
 # get arguments
-bench_params=$(echo "$params" | jq -r '.parameters')
-bench_args=$(json2args "$bench_params")
-bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
-bench_envs=$(json2envs "$bench_environment_variables")
+latency_params=$(echo "$params" | jq -r '.parameters')
+latency_args=$(json2args "$latency_params")
+latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+latency_envs=$(json2envs "$latency_environment_variables")
 # check if there is enough GPU to run the test
-tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
+tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
 if [[ "$ON_CPU" == "1" ]]; then
-pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
+pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
 world_size=$(($tp*$pp))
 if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
 echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@@ -226,42 +225,97 @@ run_benchmark_tests() {
 fi
 fi
-bench_command=" $bench_envs vllm bench $test_type \
+latency_command=" $latency_envs vllm bench latency \
 --output-json $RESULTS_FOLDER/${test_name}.json \
-$bench_args"
+$latency_args"
 echo "Running test case $test_name"
-echo "${test_type^} command: $bench_command"
-# recording benchmarking command and GPU command
+echo "Latency command: $latency_command"
+# recoding benchmarking command ang GPU command
 jq_output=$(jq -n \
---arg command "$bench_command" \
+--arg latency "$latency_command" \
 --arg gpu "$gpu_type" \
---arg test_type "$test_type" \
 '{
-($test_type + "_command"): $command,
+latency_command: $latency,
 gpu_type: $gpu
 }')
 echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
 # run the benchmark
-eval "$bench_command"
+eval "$latency_command"
 kill_gpu_processes
 done
 }
-run_latency_tests() {
-run_benchmark_tests "latency" "$1"
-}
-run_startup_tests() {
-run_benchmark_tests "startup" "$1"
-}
 run_throughput_tests() {
-run_benchmark_tests "throughput" "$1"
+# run throughput tests using `vllm bench throughput`
+# $1: a json file specifying throughput test cases
+local throughput_test_file
+throughput_test_file=$1
+# Iterate over throughput tests
+jq -c '.[]' "$throughput_test_file" | while read -r params; do
+# get the test name, and append the GPU type back to it.
+test_name=$(echo "$params" | jq -r '.test_name')
+if [[ ! "$test_name" =~ ^throughput_ ]]; then
+echo "In throughput-test.json, test_name must start with \"throughput_\"."
+exit 1
+fi
+# if TEST_SELECTOR is set, only run the test cases that match the selector
+if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+echo "Skip test case $test_name."
+continue
+fi
+# get arguments
+throughput_params=$(echo "$params" | jq -r '.parameters')
+throughput_args=$(json2args "$throughput_params")
+throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
+throughput_envs=$(json2envs "$throughput_environment_variables")
+# check if there is enough GPU to run the test
+tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
+if [[ "$ON_CPU" == "1" ]]; then
+pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
+world_size=$(($tp*$pp))
+if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
+continue
+fi
+else
+if [[ $gpu_count -lt $tp ]]; then
+echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+continue
+fi
+fi
+throughput_command=" $throughput_envs vllm bench throughput \
+--output-json $RESULTS_FOLDER/${test_name}.json \
+$throughput_args"
+echo "Running test case $test_name"
+echo "Throughput command: $throughput_command"
+# recoding benchmarking command ang GPU command
+jq_output=$(jq -n \
+--arg command "$throughput_command" \
+--arg gpu "$gpu_type" \
+'{
+throughput_command: $command,
+gpu_type: $gpu
+}')
+echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+# run the benchmark
+eval "$throughput_command"
+kill_gpu_processes
+done
 }
 run_serving_tests() {
@@ -480,7 +534,6 @@ main() {
 # benchmarking
 run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
 run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
-run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
 run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
 # postprocess benchmarking results
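
The benchmark runner above is driven by JSON test-case files: each entry carries a test_name prefixed with the test type, optional environment_variables, and a parameters object that json2args turns into CLI flags for vllm bench. A hedged sketch of that expansion; the file name, model, and the exact json2args behavior are assumptions for illustration:

#!/bin/bash
# Sketch of expanding one test-case entry into a command line (values are made up).
cat > /tmp/latency-tests.json << 'EOF'
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {},
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "num_iters": 5
    }
  }
]
EOF
params=$(jq -c '.[0].parameters' /tmp/latency-tests.json)
# json2args-style conversion (assumed): {"tensor_parallel_size": 1} -> "--tensor-parallel-size 1"
args=$(echo "$params" | jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")')
echo vllm bench latency $args --output-json /tmp/latency_llama8B_tp1.json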

View File

@@ -1,270 +1,277 @@
steps: steps:
# aarch64 + CUDA builds
- label: "Build wheel - aarch64 - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0"
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build wheel - aarch64 - CPU"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - x86_64 - CUDA 12.9"
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0"
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# x86 CPU wheel build
- label: "Build wheel - x86_64 - CPU"
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
# Build release images (CUDA 12.9)
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- block: "Build CUDA 13.0 release images"
key: block-release-image-build-cuda-13-0
depends_on: ~
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Build release image - aarch64 - CUDA 13.0"
depends_on: block-release-image-build-cuda-13-0
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: release-version
- group: "Build Python wheels" - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: "build-wheels" key: block-upload-release-wheels
steps: depends_on:
- label: "Build wheel - aarch64 - CUDA 12.9" - input-release-version
depends_on: ~ - build-wheel-x86-cuda-12-9
id: build-wheel-arm64-cuda-12-9 - build-wheel-x86-cuda-13-0
agents: - build-wheel-x86-cpu
queue: arm64_cpu_queue_postmerge - build-wheel-arm64-cuda-12-9
commands: - build-wheel-arm64-cuda-13-0
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: - build-wheel-arm64-cpu
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CUDA 13.0" - label: "Upload release wheels to PyPI and GitHub"
depends_on: ~ depends_on:
id: build-wheel-arm64-cuda-13-0 - block-upload-release-wheels
agents: id: upload-release-wheels
queue: arm64_cpu_queue_postmerge agents:
commands: queue: small_cpu_queue_postmerge
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: commands:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - "bash .buildkite/scripts/upload-release-wheels.sh"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - aarch64 - CPU" - block: "Build CPU release image"
depends_on: ~ key: block-cpu-release-image-build
id: build-wheel-arm64-cpu depends_on: ~
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 12.9" - label: "Build and publish CPU release image"
depends_on: ~ depends_on: block-cpu-release-image-build
id: build-wheel-x86-cuda-12-9 agents:
agents: queue: cpu_queue_postmerge
queue: cpu_queue_postmerge commands:
commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" env:
env: DOCKER_BUILDKIT: "1"
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CUDA 13.0" - block: "Build arm64 CPU release image"
depends_on: ~ key: block-arm64-cpu-release-image-build
id: build-wheel-x86-cuda-13-0 depends_on: ~
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - x86_64 - CPU" - label: "Build and publish arm64 CPU release image"
depends_on: ~ depends_on: block-arm64-cpu-release-image-build
id: build-wheel-x86-cpu agents:
agents: queue: arm64_cpu_queue_postmerge
queue: cpu_queue_postmerge commands:
commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" env:
env: DOCKER_BUILDKIT: "1"
DOCKER_BUILDKIT: "1"
- group: "Build release Docker images" - block: "Build ROCm release image"
key: "build-release-images" key: block-rocm-release-image-build
steps: depends_on: ~
- label: "Build release image - x86_64 - CUDA 12.9"
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image - aarch64 - CUDA 12.9" - label: "Build release image (ROCm)"
depends_on: ~ depends_on: block-rocm-release-image-build
id: build-release-image-arm64 id: build-release-image-rocm
agents: agents:
queue: arm64_cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." # Build base image first
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- label: "Build release image - x86_64 - CUDA 13.0"
depends_on: ~ - label: "Build and publish nightly multi-arch image to DockerHub"
id: build-release-image-x86-cuda-13-0 depends_on:
agents: - create-multi-arch-manifest
queue: cpu_queue_postmerge if: build.env("NIGHTLY") == "1"
commands: agents:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" queue: small_cpu_queue_postmerge
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." commands:
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# re-tag to default image tag and push, just in case arm64 build fails - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- label: "Build release image - aarch64 - CUDA 13.0" - "docker push vllm/vllm-openai:nightly-x86_64"
depends_on: ~ - "docker push vllm/vllm-openai:nightly-aarch64"
id: build-release-image-arm64-cuda-13-0 - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
agents: - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
queue: arm64_cpu_queue_postmerge - "docker manifest push vllm/vllm-openai:nightly"
commands: - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" # Clean up old nightly builds (keep only last 14)
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ." plugins:
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130" - docker-login#v3.0.0:
username: vllmbot
- block: "Build release image for x86_64 CPU" password-env: DOCKERHUB_TOKEN
key: block-cpu-release-image-build env:
depends_on: ~ DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Build release image - x86_64 - CPU"
depends_on:
- block-cpu-release-image-build
- input-release-version
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build release image for arm64 CPU"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build release image - arm64 - CPU"
depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- group: "Publish release images"
key: "publish-release-images"
steps:
- label: "Create multi-arch manifest - CUDA 12.9"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow - CUDA 12.9"
depends_on:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Create multi-arch manifest - CUDA 13.0"
depends_on:
- build-release-image-x86-cuda-13-0
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
depends_on:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
depends_on:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
@@ -459,7 +466,7 @@
 S3_BUCKET: "vllm-wheels"
 # ROCm Job 2: Build vLLM ROCm Wheel
-- label: ":python: Build vLLM ROCm Wheel - x86_64"
+- label: ":python: Build vLLM ROCm Wheel"
 id: build-rocm-vllm-wheel
 depends_on:
 - step: build-rocm-base-wheels
@@ -621,93 +628,9 @@ steps:
 depends_on:
 - step: upload-rocm-wheels
 allow_failure: true
-- step: input-release-version
-allow_failure: true
 agents:
 queue: cpu_queue_postmerge
 commands:
 - "bash .buildkite/scripts/annotate-rocm-release.sh"
 env:
 S3_BUCKET: "vllm-wheels"
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
- block: "Generate Root Index for ROCm Wheels for Release"
key: block-generate-root-index-rocm-wheels
depends_on: upload-rocm-wheels
- label: ":package: Generate Root Index for ROCm Wheels for Release"
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_postmerge
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
S3_BUCKET: "vllm-wheels"
VARIANT: "rocm700"
# ROCm Job 5: Build ROCm Release Docker Image
- label: ":docker: Build release image - x86_64 - ROCm"
id: build-rocm-release-image
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
- |
set -euo pipefail
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download Docker image from S3 (set by build-rocm-base-wheels)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
exit 1
fi
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Tag and push the base image to ECR
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Build vLLM ROCm release image using cached base
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .
# Push to ECR
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
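
The release steps above push one image per architecture and then stitch them together under a single tag with docker manifest, so a plain docker pull resolves to the right architecture. A minimal reproduction of that pattern with placeholder registry and tags:

#!/bin/bash
# Placeholder names; mirrors the manifest create/push steps in the pipeline above.
REPO="public.ecr.aws/example/vllm-release-repo"
COMMIT="abc1234"
docker manifest create "${REPO}:${COMMIT}" \
  "${REPO}:${COMMIT}-x86_64" \
  "${REPO}:${COMMIT}-aarch64" \
  --amend
docker manifest push "${REPO}:${COMMIT}"
# Clients that pull ${REPO}:${COMMIT} now receive the image matching their CPU architecture.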

View File

@@ -11,80 +11,51 @@ fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit): To download the wheel (by commit):
\`\`\` \`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
(Optional) For CUDA 13.0: aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl . aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\` \`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image: To download and upload the image:
\`\`\` \`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64 docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130 docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130 docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-aarch64-cu130 docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130 docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker manifest rm vllm/vllm-openai:latest docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
\`\`\` \`\`\`
EOF EOF

View File

@@ -3,32 +3,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# #
# Generate Buildkite annotation for ROCm wheel release # Generate Buildkite annotation for ROCm wheel release
set -ex set -ex
# Get build configuration from meta-data # Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base # Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0" # BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown") ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12") PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev"
fi
# S3 URLs # S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}" S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## ROCm Wheel and Docker Image Releases ## :rocm: ROCm Wheel Release
### Build Configuration ### Build Configuration
| Setting | Value | | Setting | Value |
|---------|-------| |---------|-------|
@@ -41,72 +34,41 @@ buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' <<
### :package: Installation ### :package: Installation
**Install from this build (by commit):** **Install from this build (by commit):**
\`\`\`bash \`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example for ROCm ${ROCM_VERSION}: # Example:
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\` \`\`\`
**Install from nightly (if published):** **Install from nightly (if published):**
\`\`\`bash \`\`\`bash
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\` \`\`\`
### :floppy_disk: Download Wheels Directly ### :floppy_disk: Download Wheels Directly
\`\`\`bash \`\`\`bash
# List all ROCm wheels # List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
# Download specific wheels # Download specific wheels
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/vllm-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torch-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton-kernels-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl . aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
\`\`\` \`\`\`
### :gear: Included Packages ### :gear: Included Packages
- **vllm**: vLLM with ROCm support - **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION} - **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton**: Triton - **triton_rocm**: Triton built for ROCm
- **triton-kernels**: Triton kernels
- **torchvision**: TorchVision for ROCm PyTorch - **torchvision**: TorchVision for ROCm PyTorch
- **torchaudio**: Torchaudio for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings - **amdsmi**: AMD SMI Python bindings
- **aiter**: Aiter for ROCm
- **flash-attn**: Flash Attention for ROCm
### :warning: Notes ### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs - These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH} - Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only - Platform: Linux x86_64 only
### :package: Docker Image Release
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
\`\`\`
EOF EOF

View File

@@ -1,242 +0,0 @@
#!/bin/bash
#
# cherry-pick-from-milestone.sh
# Find commits from a GitHub milestone that are missing from the current branch
# and output them in chronological order for cherry-picking.
#
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
usage() {
cat <<EOF
Usage: $(basename "$0") <milestone> [options]
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
Arguments:
milestone The GitHub milestone name (e.g., v0.14.0)
Options:
--dry-run Show the cherry-pick commands without executing (default)
--execute Actually execute the cherry-picks
--main-branch Specify the main branch name (default: main)
--help Show this help message
Examples:
$(basename "$0") v0.14.0
$(basename "$0") v0.14.0 --dry-run
$(basename "$0") v0.14.0 --execute
$(basename "$0") v0.14.0 --main-branch master
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Default values
MILESTONE=""
DRY_RUN=true
MAIN_BRANCH="main"
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--dry-run)
DRY_RUN=true
shift
;;
--execute)
DRY_RUN=false
shift
;;
--main-branch)
MAIN_BRANCH="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
if [[ -z "$MILESTONE" ]]; then
MILESTONE="$1"
else
log_error "Unexpected argument: $1"
usage
fi
shift
;;
esac
done
# Validate milestone argument
if [[ -z "$MILESTONE" ]]; then
log_error "Milestone is required"
usage
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Check if gh CLI is available
if ! command -v gh &>/dev/null; then
log_error "GitHub CLI (gh) is not installed"
exit 1
fi
# Check if authenticated with gh
if ! gh auth status &>/dev/null; then
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
exit 1
fi
CURRENT_BRANCH=$(git branch --show-current)
log_info "Current branch: ${CURRENT_BRANCH}"
log_info "Main branch: ${MAIN_BRANCH}"
log_info "Milestone: ${MILESTONE}"
echo ""
# Fetch latest from remote
log_info "Fetching latest from remote..."
git fetch origin "$MAIN_BRANCH" --quiet
# Get merged PRs from the milestone, sorted by merge date
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
# Store PR data in a temp file
PR_DATA=$(mktemp)
trap "rm -f $PR_DATA" EXIT
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
--limit 1000 \
--json number,title,mergeCommit,mergedAt \
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
log_error "This could be due to:"
log_error " - Milestone does not exist"
log_error " - Network/authentication issues"
log_error " - Invalid milestone name format"
exit 1
fi
if [[ ! -s "$PR_DATA" ]]; then
log_warn "No merged PRs found for milestone '${MILESTONE}'"
exit 0
fi
TOTAL_PRS=$(wc -l < "$PR_DATA")
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
echo ""
# Find commits that are missing from current branch
MISSING_COMMITS=()
MISSING_INFO=()
while IFS=$'\t' read -r sha pr_number title; do
# Skip if SHA is empty or null
if [[ -z "$sha" || "$sha" == "null" ]]; then
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
continue
fi
# Check if this commit is already in the current branch
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
log_success "PR #${pr_number} already in branch: ${title:0:60}"
else
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
MISSING_COMMITS+=("$sha")
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
fi
done < "$PR_DATA"
echo ""
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
exit 0
fi
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
echo ""
# Output the cherry-pick commands
echo "=========================================="
echo "Cherry-pick commands (in chronological order):"
echo "=========================================="
echo ""
for info in "${MISSING_INFO[@]}"; do
echo "# $info"
done
echo ""
echo "# Run these commands to cherry-pick all missing commits:"
echo "git cherry-pick ${MISSING_COMMITS[*]}"
echo ""
# Or one by one
echo "# Or cherry-pick one at a time:"
for sha in "${MISSING_COMMITS[@]}"; do
echo "git cherry-pick $sha"
done
echo ""
# Execute if requested
if [[ "$DRY_RUN" == false ]]; then
echo "=========================================="
log_info "Executing cherry-picks..."
echo "=========================================="
for i in "${!MISSING_COMMITS[@]}"; do
sha="${MISSING_COMMITS[$i]}"
info="${MISSING_INFO[$i]}"
echo ""
log_info "Cherry-picking: $info"
if git cherry-pick "$sha"; then
log_success "Successfully cherry-picked $sha"
else
log_error "Failed to cherry-pick $sha"
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
exit 1
fi
done
echo ""
log_success "All cherry-picks completed successfully!"
else
echo "=========================================="
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
echo "Run with --execute to perform the cherry-picks"
echo "=========================================="
fi
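The core of the script above is the `git merge-base --is-ancestor` test, which decides whether a PR's merge commit is already contained in the current branch. A minimal standalone sketch of that check (the `is-picked.sh` name is hypothetical, not part of the repository):

```bash
#!/bin/bash
# Usage: ./is-picked.sh <merge-commit-sha>
# Mirrors the script's core test: is the commit already reachable from HEAD?
sha="$1"
if git merge-base --is-ancestor "$sha" HEAD; then
  echo "already on this branch - nothing to cherry-pick"
else
  echo "missing - candidate for: git cherry-pick $sha"
fi
```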

View File

@@ -3,14 +3,7 @@
set -ex set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix # This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Get tag prefix from argument, default to "nightly-" if not provided
TAG_PREFIX="${1:-nightly-}"
echo "Cleaning up tags with prefix: $TAG_PREFIX"
# DockerHub API endpoint for vllm/vllm-openai repository # DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -52,7 +45,7 @@ get_all_tags() {
set -x set -x
# Get both last_updated timestamp and tag name, separated by | # Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"') local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
if [ -z "$tags" ]; then if [ -z "$tags" ]; then
break break

View File

@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str: def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
""" """
Generate project list HTML content linking to each project & variant subdirectory. Generate project list HTML content linking to each project & variant sub-directory.
""" """
href_tags = [] href_tags = []
for name in sorted(subdir_names): for name in sorted(subdir_names):
@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files. comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata. First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory). We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory. The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted. is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant. Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly. as the default variant index, but the links are adjusted accordingly.
Index directory structure: Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/ vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path) index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant subdirectory cpu/ # cpu variant sub-directory
index.html index.html
vllm/ vllm/
index.html index.html
@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/ vllm/
index.html index.html
metadata.json metadata.json
cu130/ # cu130 variant subdirectory cu130/ # cu130 variant sub-directory
index.html index.html
vllm/ vllm/
index.html index.html
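The directory layout described in this docstring is a standard PEP 503 "simple" index, so a generated tree can be smoke-tested locally by serving it over HTTP and pointing pip at it. A rough sketch, assuming the index was generated into `./index` and the wheel it links to is reachable at the relative path in the generated `index.html`:

```bash
# Serve the generated index directory locally (PEP 503 layout).
python3 -m http.server 8080 --directory ./index &
SERVER_PID=$!

# Resolve vllm through the local index only; a variant subdirectory such as
# cu130/ can be targeted the same way by appending it to the URL.
pip install --dry-run --no-deps vllm \
  --index-url http://localhost:8080/ \
  --trusted-host localhost

kill $SERVER_PID
```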

View File

@@ -44,17 +44,6 @@ cleanup_docker() {
fi fi
} }
cleanup_network() {
for node in $(seq 0 $((NUM_NODES-1))); do
    if docker ps -a -q -f name="node${node}" | grep -q .; then
docker stop "node${node}"
fi
done
if docker network ls | grep docker-net; then
docker network rm docker-net
fi
}
# Call the cleanup docker function # Call the cleanup docker function
cleanup_docker cleanup_docker
@@ -235,35 +224,6 @@ if [[ $commands == *"--shard-id="* ]]; then
echo "All shards reported no tests collected. Failing the build." echo "All shards reported no tests collected. Failing the build."
exit 1 exit 1
fi fi
elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
echo "PREFIX: ${prefix}"
export composite_command="(command rocm-smi || true)"
myIFS=$IFS
IFS=','
read -ra node0 <<< ${BASH_REMATCH[2]}
read -ra node1 <<< ${BASH_REMATCH[3]}
IFS=$myIFS
for i in "${!node0[@]}";do
command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
echo "COMMANDS: ${commands}"
composite_command=$(echo "${composite_command} && ${commands}")
done
/bin/bash -c "${composite_command}"
cleanup_network
else
echo "Failed to parse node commands! Exiting."
cleanup_network
exit 111
fi
else else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \

View File

@@ -46,7 +46,7 @@ docker run \
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '

View File

@@ -1,36 +0,0 @@
#!/bin/bash
set -ex
# Get tag variant from argument, default to empty if not provided, should be something like "cu130".
# Due to limits in cleanup script, we must move variants to use separate tags like "cu130-nightly",
# otherwise they will be cleaned up together with the main "nightly" tags.
TAG_VARIANT="$1"
if [ -n "$TAG_VARIANT" ]; then
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
TAG_NAME="$TAG_VARIANT-nightly"
else
ORIG_TAG_SUFFIX=""
TAG_NAME="nightly"
fi
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
# pull original arch-dependent images from AWS ECR Public
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
# tag arch-dependent images
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-dependent images to DockerHub
docker push vllm/vllm-openai:$TAG_NAME-x86_64
docker push vllm/vllm-openai:$TAG_NAME-aarch64
# push arch-independent manifest to DockerHub
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
docker manifest push vllm/vllm-openai:$TAG_NAME
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT

View File

@@ -18,18 +18,15 @@ wait_for_server() {
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
# Set BACKENDS and platform-specific args based on platform # Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform # ROCm platform
BACKENDS=("allgather_reducescatter") BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail # Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0 export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else else
# Non-ROCm platform (CUDA/other) # Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency") BACKENDS=("deepep_high_throughput" "deepep_low_latency")
PLATFORM_ARGS=()
fi fi
cleanup() { cleanup() {
@@ -57,7 +54,6 @@ for BACK in "${BACKENDS[@]}"; do
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--gpu-memory-utilization 0.9 \ --gpu-memory-utilization 0.9 \
"${PLATFORM_ARGS[@]}" \
--port $PORT & --port $PORT &
SERVER_PID=$! SERVER_PID=$!
wait_for_server $PORT wait_for_server $PORT
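The `wait_for_server` helper used above is defined earlier in the script and is not shown in this hunk; a typical implementation just polls the OpenAI-compatible server's `/health` endpoint until it answers. A hedged sketch of such a helper (the timeout value is an assumption, not taken from the original script):

```bash
# Poll the server's /health endpoint until it responds or a timeout is hit.
wait_for_server() {
  local port=$1
  local timeout=600   # seconds; illustrative default, not from the original script
  local start=$(date +%s)
  until curl -sf "http://localhost:${port}/health" > /dev/null; do
    if (( $(date +%s) - start > timeout )); then
      echo "server on port ${port} did not become ready" >&2
      return 1
    fi
    sleep 5
  done
}
```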

View File

@@ -1,227 +0,0 @@
#!/bin/bash
#
# trigger-ci-build.sh
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
#
# Usage: ./trigger-ci-build.sh [options]
#
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
#
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
#
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default configuration
PIPELINE="ci"
DRY_RUN=true
usage() {
cat <<EOF
Usage: $(basename "$0") [options]
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
Options:
--execute Actually trigger the build (default: dry-run)
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
--commit Override commit SHA (default: current HEAD)
--branch Override branch name (default: current branch)
--message Custom build message (default: auto-generated)
--help Show this help message
Prerequisites:
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
- bk configured: bk configure
Examples:
$(basename "$0") # Dry-run, show what would happen
$(basename "$0") --execute # Actually trigger the build
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
EOF
exit 1
}
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[OK]${NC} $1"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Parse arguments
COMMIT=""
BRANCH=""
MESSAGE=""
while [[ $# -gt 0 ]]; do
case $1 in
--execute)
DRY_RUN=false
shift
;;
--pipeline)
PIPELINE="$2"
shift 2
;;
--commit)
COMMIT="$2"
shift 2
;;
--branch)
BRANCH="$2"
shift 2
;;
--message)
MESSAGE="$2"
shift 2
;;
--help|-h)
usage
;;
-*)
log_error "Unknown option: $1"
usage
;;
*)
log_error "Unexpected argument: $1"
usage
;;
esac
done
# Check if bk CLI is installed
if ! command -v bk &>/dev/null; then
log_error "Buildkite CLI (bk) is not installed"
echo ""
echo "Install with:"
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
echo ""
echo "Then configure:"
echo " bk configure"
exit 1
fi
# Check if we're in a git repository
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
log_error "Not in a git repository"
exit 1
fi
# Get current commit and branch if not overridden
if [[ -z "$COMMIT" ]]; then
COMMIT=$(git rev-parse HEAD)
fi
if [[ -z "$BRANCH" ]]; then
BRANCH=$(git branch --show-current)
if [[ -z "$BRANCH" ]]; then
# Detached HEAD state - try to get branch from ref
BRANCH=$(git rev-parse --abbrev-ref HEAD)
fi
fi
# Generate default message if not provided
if [[ -z "$MESSAGE" ]]; then
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
MESSAGE="[Manual] ${COMMIT_MSG}"
fi
# Safety check: Verify the commit exists on the remote
log_info "Verifying commit exists on remote..."
git fetch origin --quiet 2>/dev/null || true
# Check if commit is reachable from any remote branch
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
if [[ -z "$REMOTE_BRANCHES" ]]; then
log_error "Commit ${COMMIT} does not exist on any remote branch!"
echo ""
echo "The CI system will fail to checkout this commit."
echo "Please push your changes first:"
echo ""
echo " git push origin ${BRANCH}"
echo ""
exit 1
fi
log_success "Commit found on remote branches:"
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
echo " ... and more"
fi
echo ""
log_info "Pipeline: ${PIPELINE}"
log_info "Branch: ${BRANCH}"
log_info "Commit: ${COMMIT}"
log_info "Message: ${MESSAGE}"
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
echo ""
# Build the command
CMD=(bk build create
-y
-w
-i
--pipeline "${PIPELINE}"
--commit "${COMMIT}"
--branch "${BRANCH}"
--message "${MESSAGE}"
--env "RUN_ALL=1"
--env "NIGHTLY=1"
)
if [[ "$DRY_RUN" == true ]]; then
echo "=========================================="
log_warn "DRY-RUN MODE - No build will be triggered"
echo "=========================================="
echo ""
echo "Command that would be executed:"
echo ""
# Escape single quotes in values for safe shell display
escape_for_shell() {
printf '%s' "$1" | sed "s/'/'\\\\''/g"
}
echo " bk build create \\"
echo " -y \\"
echo " -w \\"
echo " -i \\"
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
echo " --env 'RUN_ALL=1' \\"
echo " --env 'NIGHTLY=1'"
echo ""
echo "=========================================="
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
echo ""
echo " $0 --execute"
echo "=========================================="
exit 0
fi
log_info "Triggering build..."
# Execute the command - bk will print the URL and open browser
"${CMD[@]}"

View File

@@ -1,70 +0,0 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [[ -z "$PYPI_TOKEN" ]]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
set -x # avoid printing secrets above
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -e
BUCKET="vllm-wheels"
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "v$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
exit 1
fi
fi
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
export TWINE_USERNAME="__token__"
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
pip install twine
python3 -m twine --version
# copy release wheels to local directory
DIST_DIR=/tmp/vllm-release-dist
echo "Existing wheels on S3:"
aws s3 ls "$S3_COMMIT_PREFIX"
echo "Copying wheels to local directory"
mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name
aws s3 cp --recursive --exclude "*" --include "vllm-${RELEASE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${RELEASE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${RELEASE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl

View File

@@ -71,7 +71,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
@@ -83,7 +82,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -430,8 +428,6 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -456,12 +452,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative] mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
@@ -640,9 +634,8 @@ steps:
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -710,17 +703,6 @@ steps:
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels Helion Test
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -873,7 +855,7 @@ steps:
- label: Language Models Tests (Standard) - label: Language Models Tests (Standard)
timeout_in_minutes: 25 timeout_in_minutes: 25
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
torch_nightly: true torch_nightly: true
@@ -1132,7 +1114,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1278,7 +1260,7 @@ steps:
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30 timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdmultinode] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
@@ -1292,15 +1274,15 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
@@ -1469,7 +1451,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min - label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 30 timeout_in_minutes: 30
@@ -1480,10 +1462,10 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 agent_pool: mi325_4
# grade: Blocking # grade: Blocking
timeout_in_minutes: 15 timeout_in_minutes: 15
@@ -1494,7 +1476,7 @@ steps:
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
commands: commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test ##### ##### multi gpus test #####
##### A100 test ##### ##### A100 test #####
@@ -1509,9 +1491,6 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
@@ -1683,6 +1662,17 @@ steps:
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60

View File

@@ -64,7 +64,6 @@ steps:
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
@@ -76,7 +75,6 @@ steps:
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -362,7 +360,7 @@ steps:
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s v1/spec_decode
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
@@ -376,8 +374,6 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -400,12 +396,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
gpu: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
- label: V1 Test others (CPU) # 5 mins - label: V1 Test others (CPU) # 5 mins
source_file_dependencies: source_file_dependencies:
@@ -568,9 +562,8 @@ steps:
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -631,56 +624,6 @@ steps:
- pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py
- pytest -v -s kernels/attention/test_deepgemm_attention.py - pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Kernels Helion Test
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
gpu: h100
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
gpu: h100
num_gpus: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
gpu: b200
num_gpus: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
- label: Model Executor Test # 23min - label: Model Executor Test # 23min
timeout_in_minutes: 35 timeout_in_minutes: 35
torch_nightly: true torch_nightly: true
@@ -1008,7 +951,7 @@ steps:
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 23 min - label: Blackwell Test # 21 min
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
gpu: b200 gpu: b200
@@ -1018,7 +961,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1048,8 +991,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Blackwell Fusion and Compile Tests # 30 min - label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40 timeout_in_minutes: 40
@@ -1104,48 +1045,6 @@ steps:
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Hopper Fusion E2E Tests (H100) # 10min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
gpu: h100
optional: true
num_gpus: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- label: Blackwell GPT-OSS Eval - label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
@@ -1317,7 +1216,7 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min - label: Pipeline + Context Parallelism Test # 45min
timeout_in_minutes: 60 timeout_in_minutes: 60
@@ -1420,20 +1319,6 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Acceptance Length Test (Large Models) # optional
timeout_in_minutes: 120
gpu: h100
optional: true
num_gpus: 1
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/model_executor/models/mlp_speculator.py
- tests/v1/spec_decode/test_acceptance_length.py
commands:
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 gpu: a100
optional: true optional: true
@@ -1459,31 +1344,22 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
- label: Sequence Parallel Tests (H100) # 60 min ##### H200 test #####
timeout_in_minutes: 60 - label: Distributed Tests (H200) # optional
working_dir: "/vllm-workspace/" gpu: h200
gpu: h100
optional: true
num_gpus: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- label: Distributed Tests (H100) # optional
gpu: h100
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### H200 test #####
- label: LM Eval Large Models (H200) # optional - label: LM Eval Large Models (H200) # optional
timeout_in_minutes: 60 timeout_in_minutes: 60
gpu: h200 gpu: h200


@@ -4,10 +4,8 @@ depends_on:
steps: steps:
- label: V1 attention (H100) - label: V1 attention (H100)
timeout_in_minutes: 30 timeout_in_minutes: 30
device: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
@@ -15,11 +13,9 @@ steps:
- label: V1 attention (B200) - label: V1 attention (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention - vllm/v1/attention
- tests/v1/attention - tests/v1/attention
commands: commands:
- pytest -v -s v1/attention - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this


@@ -5,7 +5,7 @@ steps:
- label: Fusion and Compile Tests (B200) - label: Fusion and Compile Tests (B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -26,7 +26,7 @@ steps:
- nvidia-smi - nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set # this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml # Wrap with quotes to escape yaml
@@ -37,9 +37,9 @@ steps:
- label: Fusion E2E (2 GPUs)(B200) - label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40 timeout_in_minutes: 40
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
optional: true optional: true
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py


@@ -5,7 +5,7 @@ steps:
- label: Distributed Comm Ops - label: Distributed Comm Ops
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/distributed - vllm/distributed
- tests/distributed - tests/distributed
@@ -18,7 +18,7 @@ steps:
- label: Distributed (2 GPUs) - label: Distributed (2 GPUs)
timeout_in_minutes: 90 timeout_in_minutes: 90
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/compilation/ - vllm/compilation/
- vllm/distributed/ - vllm/distributed/
@@ -54,7 +54,7 @@ steps:
- label: Distributed Tests (4 GPUs) - label: Distributed Tests (4 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- tests/distributed/test_utils - tests/distributed/test_utils
@@ -103,8 +103,8 @@ steps:
- label: Distributed Tests (8 GPUs)(H100) - label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10 timeout_in_minutes: 10
device: h100 gpu: h100
num_devices: 8 num_gpus: 8
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
source_file_dependencies: source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +120,9 @@ steps:
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: Distributed Tests (4 GPUs)(A100) - label: Distributed Tests (4 GPUs)(A100)
device: a100 gpu: a100
optional: true optional: true
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
@@ -133,34 +133,26 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: Sequence Parallel Tests (H100) - label: Distributed Tests (2 GPUs)(H200)
timeout_in_minutes: 60 gpu: h200
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- label: Distributed Tests (2 GPUs)(H100)
device: h100
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_devices: 2 num_gpus: 2
commands: commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
device: b200 gpu: b200
optional: true optional: true
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_devices: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -169,9 +161,8 @@ steps:
- label: 2 Node Test (4 GPUs) - label: 2 Node Test (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
num_nodes: 2 num_nodes: 2
no_plugin: true
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -180,12 +171,12 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs) - label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/ - tests/v1/kv_connector/nixl_integration/
@@ -193,21 +184,10 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Pipeline + Context Parallelism (4 GPUs) - label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/ - vllm/distributed/
- vllm/engine/ - vllm/engine/
@@ -216,46 +196,4 @@ steps:
- tests/distributed/ - tests/distributed/
commands: commands:
- pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: Hopper Fusion E2E Tests (H100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# skip Llama-4 since it does not fit on this device
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
- label: Hopper Fusion Distributed E2E Tests (2xH100)
timeout_in_minutes: 70
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py


@@ -4,27 +4,27 @@ depends_on:
steps: steps:
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy - label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60 timeout_in_minutes: 60
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
device: b200 gpu: b200
optional: true optional: true
num_devices: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,11 +33,10 @@ steps:
timeout_in_minutes: 30 timeout_in_minutes: 30
optional: true optional: true
soft_fail: true soft_fail: true
num_devices: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh


@@ -23,8 +23,4 @@ steps:
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
# Run this test standalone for now; - pytest -v -s v1/engine
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py


@@ -14,7 +14,7 @@ steps:
- label: EPLB Execution - label: EPLB Execution
timeout_in_minutes: 20 timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/distributed/eplb - vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py - tests/distributed/test_eplb_execute.py


@@ -15,9 +15,8 @@ steps:
timeout_in_minutes: 35 timeout_in_minutes: 35
source_file_dependencies: source_file_dependencies:
- csrc/attention/ - csrc/attention/
- vllm/attention
- vllm/v1/attention - vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention - tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -58,8 +57,8 @@ steps:
- label: Kernels DeepGEMM Test (H100) - label: Kernels DeepGEMM Test (H100)
timeout_in_minutes: 45 timeout_in_minutes: 45
device: h100 gpu: h100
num_devices: 1 num_gpus: 1
source_file_dependencies: source_file_dependencies:
- tools/install_deepgemm.sh - tools/install_deepgemm.sh
- vllm/utils/deep_gemm.py - vllm/utils/deep_gemm.py
@@ -78,7 +77,7 @@ steps:
- label: Kernels (B200) - label: Kernels (B200)
timeout_in_minutes: 30 timeout_in_minutes: 30
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
# optional: true # optional: true
source_file_dependencies: source_file_dependencies:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
@@ -86,7 +85,7 @@ steps:
- csrc/quantization/cutlass_w8a8/moe/ - csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -115,55 +114,4 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
# e2e
- pytest -v -s tests/models/quantization/test_nvfp4.py
- label: Kernels Helion Test
timeout_in_minutes: 30
device: h100
source_file_dependencies:
- vllm/utils/import_utils.py
- tests/kernels/helion/
commands:
- pip install helion
- pytest -v -s kernels/helion/
- label: Kernels FP8 MoE Test (1 H100)
timeout_in_minutes: 90
device: h100
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutlass_moe.py
- pytest -v -s kernels/moe/test_flashinfer.py
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
- pytest -v -s kernels/moe/test_moe.py
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
- pytest -v -s kernels/moe/test_block_int8.py
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
- label: Kernels FP8 MoE Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py


@@ -12,9 +12,9 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Large Models (4 GPUs)(A100) - label: LM Eval Large Models (4 GPUs)(A100)
device: a100 gpu: a100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@@ -24,9 +24,9 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: LM Eval Large Models (4 GPUs)(H100) - label: LM Eval Large Models (4 GPUs)(H100)
device: h100 gpu: h100
optional: true optional: true
num_devices: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
@@ -37,39 +37,10 @@ steps:
- label: LM Eval Small Models (B200) - label: LM Eval Small Models (B200)
timeout_in_minutes: 120 timeout_in_minutes: 120
device: b200 gpu: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- label: LM Eval Large Models (H200)
timeout_in_minutes: 60
device: h200
optional: true
num_devices: 8
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
- label: MoE Refactor Integration Test (H100 - TEMPORARY)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
- label: MoE Refactor Integration Test (B200 - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt


@@ -14,7 +14,7 @@ steps:
- label: LoRA TP (Distributed) - label: LoRA TP (Distributed)
timeout_in_minutes: 30 timeout_in_minutes: 30
num_devices: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora


@@ -27,12 +27,11 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Others (CPU) - label: V1 Others (CPU)
depends_on: depends_on: ~
- image-build-cpu
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
device: cpu no_gpu: true
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s -m 'cpu_test' v1/core - pytest -v -s -m 'cpu_test' v1/core
@@ -83,7 +82,7 @@ steps:
- label: Metrics, Tracing (2 GPUs) - label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20 timeout_in_minutes: 20
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1/tracing - tests/v1/tracing
@@ -115,27 +114,24 @@ steps:
- pytest -v -s utils_ - pytest -v -s utils_
- label: Async Engine, Inputs, Utils, Worker, Config (CPU) - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
depends_on: depends_on: ~
- image-build-cpu
timeout_in_minutes: 30 timeout_in_minutes: 30
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_inputs.py - tests/test_inputs.py
- tests/test_outputs.py - tests/test_outputs.py
- tests/multimodal - tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py - tests/standalone_tests/lazy_imports.py
- tests/tokenizers_ - tests/tokenizers_
- tests/tool_parsers - tests/tool_parsers
- tests/transformers_utils - tests/transformers_utils
- tests/config - tests/config
device: cpu no_gpu: true
commands: commands:
- python3 standalone_tests/lazy_imports.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py - pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal - pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_ - pytest -v -s tokenizers_
- pytest -v -s tool_parsers - pytest -v -s tool_parsers
- pytest -v -s transformers_utils - pytest -v -s transformers_utils
@@ -144,7 +140,7 @@ steps:
- label: GPT-OSS Eval (B200) - label: GPT-OSS Eval (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
@@ -157,7 +153,7 @@ steps:
- label: Batch Invariance (H100) - label: Batch Invariance (H100)
timeout_in_minutes: 25 timeout_in_minutes: 25
device: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/v1/attention - vllm/v1/attention
- vllm/model_executor/layers - vllm/model_executor/layers


@@ -39,14 +39,12 @@ steps:
- pytest -v -s models/test_transformers.py models/test_registry.py - pytest -v -s models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min - label: Basic Models Test (Other CPU) # 5min
depends_on:
- image-build-cpu
timeout_in_minutes: 10 timeout_in_minutes: 10
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/test_utils.py - tests/models/test_utils.py
- tests/models/test_vision.py - tests/models/test_vision.py
device: cpu no_gpu: true
commands: commands:
- pytest -v -s models/test_utils.py models/test_vision.py - pytest -v -s models/test_utils.py models/test_vision.py


@@ -5,7 +5,7 @@ steps:
- label: Distributed Model Tests (2 GPUs) - label: Distributed Model Tests (2 GPUs)
timeout_in_minutes: 50 timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py - vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/ - vllm/model_executor/models/


@@ -14,13 +14,11 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Processor Test (CPU) - label: Multi-Modal Processor Test (CPU)
depends_on:
- image-build-cpu
timeout_in_minutes: 60 timeout_in_minutes: 60
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
device: cpu no_gpu: true
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py


@@ -5,7 +5,7 @@ steps:
- label: Plugin Tests (2 GPUs) - label: Plugin Tests (2 GPUs)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/plugins/ - vllm/plugins/
- tests/plugins/ - tests/plugins/


@@ -16,14 +16,14 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: Quantized MoE Test (B200) - label: Quantized MoE Test (B200)
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
device: b200 gpu: b200
source_file_dependencies: source_file_dependencies:
- tests/quantization/test_blackwell_moe.py - tests/quantization/test_blackwell_moe.py
- vllm/model_executor/models/deepseek_v2.py - vllm/model_executor/models/deepseek_v2.py


@@ -5,7 +5,7 @@ steps:
- label: Weight Loading Multiple GPU # 33min - label: Weight Loading Multiple GPU # 33min
timeout_in_minutes: 45 timeout_in_minutes: 45
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@@ -15,8 +15,8 @@ steps:
- label: Weight Loading Multiple GPU - Large Models # optional - label: Weight Loading Multiple GPU - Large Models # optional
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_devices: 2 num_gpus: 2
device: a100 gpu: a100
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/

.github/CODEOWNERS vendored

@@ -2,8 +2,8 @@
# for more info about CODEOWNERS file # for more info about CODEOWNERS file
# This lists cover the "core" components of vLLM that require careful review # This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/layers/mamba @tdoublep
@@ -16,7 +16,7 @@
/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery /vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact, # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -30,14 +30,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill /vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong /vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/kv_offload @ApostaC @orozery /vllm/v1/offloading @ApostaC
/vllm/v1/worker/gpu/kv_connector.py @orozery
/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery
# Model runner V2 # Model runner V2
/vllm/v1/worker/gpu @WoosukKwon /vllm/v1/worker/gpu @WoosukKwon
@@ -56,13 +54,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee /tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep /tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC @orozery /tests/v1/kv_connector @ApostaC
/tests/v1/kv_offload @ApostaC @orozery /tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256 /tests/v1/determinism @yewentao256
# Transformers modeling backend # Transformers modeling backend

.github/mergify.yml vendored

@@ -414,18 +414,6 @@ pull_request_rules:
remove: remove:
- needs-rebase - needs-rebase
- name: label-bug
description: Automatically apply bug label
conditions:
- label != stale
- or:
- title~=(?i)\bbug\b
- title~=(?i)\bbugfix\b
actions:
label:
add:
- bug
- name: label-kv-connector - name: label-kv-connector
description: Automatically apply kv-connector label description: Automatically apply kv-connector label
conditions: conditions:


@@ -29,9 +29,8 @@ jobs:
- name: Install dependencies and build vLLM - name: Install dependencies and build vLLM
run: | run: |
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e . --no-build-isolation uv pip install -e .
env: env:
CMAKE_BUILD_PARALLEL_LEVEL: 4 CMAKE_BUILD_PARALLEL_LEVEL: 4

.gitignore vendored

@@ -7,9 +7,6 @@ vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source # OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/* vllm/third_party/triton_kernels/*
# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py
# triton jit # triton jit
.triton .triton
@@ -194,9 +191,6 @@ CLAUDE.md
AGENTS.md AGENTS.md
.codex/ .codex/
# Cursor
.cursor/
# DS Store # DS Store
.DS_Store .DS_Store


@@ -147,17 +147,6 @@ repos:
entry: python tools/pre_commit/validate_config.py entry: python tools/pre_commit/validate_config.py
language: python language: python
additional_dependencies: [regex] additional_dependencies: [regex]
- id: validate-docker-versions
name: Validate docker/versions.json matches Dockerfile
entry: python tools/generate_versions_json.py --check
language: python
files: ^docker/(Dockerfile|versions\.json)$
pass_filenames: false
additional_dependencies: [dockerfile-parse]
- id: attention-backend-docs
name: Check attention backend documentation is up to date
entry: python tools/pre_commit/generate_attention_backend_docs.py --check
language: python
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion


@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
# Generate sources: # Generate sources:
set(MARLIN_GEN_SCRIPT set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR) list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})") set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu") file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}") CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu") file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_BF16_ARCHS}") CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_SM75_ARCHS) if (MARLIN_SM75_ARCHS)
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu") file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_SM75_ARCHS}") CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
if (MARLIN_FP8_ARCHS) if (MARLIN_FP8_ARCHS)
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu") file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}" SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_FP8_ARCHS}") CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
@@ -458,10 +458,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/marlin/marlin.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/marlin/awq_marlin_repack.cu") "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}" SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}") CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")


@@ -1,266 +0,0 @@
# vLLM Attention Benchmarking Suite
Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar.
## Quick Start
```bash
cd benchmarks/attention_benchmarks
# Run a pre-configured benchmark
python benchmark.py --config configs/mla_decode.yaml
python benchmark.py --config configs/mla_mixed_batch.yaml
python benchmark.py --config configs/speculative_decode.yaml
python benchmark.py --config configs/standard_attention.yaml
python benchmark.py --config configs/reorder_threshold.yaml
# Or run custom benchmarks
python benchmark.py \
--backends flash flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--output-csv results.csv
```
## Simplified Batch Specification Grammar
Express workloads concisely using query length and sequence length:
```python
"q2k" # 2048-token prefill (q_len=2048, seq_len=2048)
"q1s1k" # Decode: 1 token with 1K sequence
"8q1s1k" # 8 decode requests
"q4s1k" # 4-token extend (e.g., spec decode)
"2q2k_32q1s1k" # Mixed: 2 prefills + 32 decodes
"16q4s1k" # 16 spec decode (4 tokens each)
```
### Grammar Rule
```text
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k': Multiplies value by 1024
Mixed batches: Use _ to combine (e.g., "2q2k_32q1s1k")
```
**Note**: Decode, prefill, and spec decode are just different query lengths - no special syntax needed!
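For example, a mixed spec simply decomposes into a flat list of (q_len, seq_len) requests. A minimal sketch using the suite's own `parse_batch_spec` and `get_batch_stats` helpers (documented in the Python API section below):
```python
from batch_spec import parse_batch_spec, get_batch_stats

# "2q2k_32q1s1k" = 2 prefills of 2048 tokens + 32 decodes against 1K contexts
requests = parse_batch_spec("2q2k_32q1s1k")
print(requests[0].as_tuple())   # (2048, 2048) -> prefill
print(requests[-1].as_tuple())  # (1, 1024)    -> decode

stats = get_batch_stats(requests)
print(stats["num_prefill"], stats["num_decode"])  # 2 32
```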
## Pre-configured Benchmarks
The suite includes several pre-configured YAML benchmark configurations:
### MLA Decode Benchmark
Tests pure decode performance across MLA backends with varying batch sizes and sequence lengths.
```bash
python benchmark.py --config configs/mla_decode.yaml
```
### MLA Mixed Batch Benchmark
Tests chunked prefill performance with mixed prefill + decode batches.
```bash
python benchmark.py --config configs/mla_mixed_batch.yaml
```
### Speculative Decoding Benchmark
Tests speculative decode scenarios (K-token verification) and reorder_batch_threshold optimization.
```bash
python benchmark.py --config configs/speculative_decode.yaml
```
### Standard Attention Benchmark
Tests standard attention backends (Flash/Triton/FlashInfer) with pure prefill, decode, and mixed batches.
```bash
python benchmark.py --config configs/standard_attention.yaml
```
### Reorder Threshold Study
**Question:** At what query length does the prefill pipeline become faster than the decode pipeline?
Tests query lengths from 1-1024 across 9 batch sizes to find the crossover point. Uses `decode_vs_prefill` mode to compare both pipelines for each query length.
```bash
python benchmark.py --config configs/reorder_threshold.yaml
```
---
## Universal Benchmark
The `benchmark.py` script handles **all** backends - both standard attention and MLA.
### Standard Attention (Flash/Triton/FlashInfer)
```bash
python benchmark.py \
--backends flash triton flashinfer \
--batch-specs "q2k" "8q1s1k" "2q2k_32q1s1k" \
--num-layers 10 \
--repeats 5 \
--output-csv results.csv
```
### MLA Backends
```bash
# Compare all MLA backends
python benchmark.py \
--backends cutlass_mla flashinfer_mla flashattn_mla flashmla \
--batch-specs "64q1s1k" "64q1s4k" \
--output-csv mla_results.csv
```
### Parameter Sweeps
Use `--sweep-param` and `--sweep-values` to run parameter sweeps from the CLI:
#### CUTLASS MLA num-splits Optimization
**Question:** What is the optimal `num_kv_splits` for CUTLASS MLA?
```bash
python benchmark.py \
--backend cutlass_mla \
--batch-specs "64q1s1k" "64q1s4k" "64q1s16k" \
--sweep-param num_kv_splits \
--sweep-values 1 2 4 8 16 \
--output-json optimal_splits.json
```
#### Reorder Batch Threshold Optimization
**Question:** What's the optimal `reorder_batch_threshold` for speculative decoding?
```bash
python benchmark.py \
--backend flashmla \
--batch-specs "q4s1k" "q8s2k" \
--sweep-param reorder_batch_threshold \
--sweep-values 1 4 16 64 256 512 \
--output-csv threshold_sweep.csv
```
### All Command-Line Options
```text
--config CONFIG # Path to YAML config file (overrides other args)
--backends BACKEND [BACKEND ...] # flash, triton, flashinfer, cutlass_mla,
# flashinfer_mla, flashattn_mla, flashmla
--backend BACKEND # Single backend (alternative to --backends)
--batch-specs SPEC [SPEC ...] # Batch specifications using extended grammar
# Model configuration
--num-layers N # Number of layers
--head-dim N # Head dimension
--num-q-heads N # Query heads
--num-kv-heads N # KV heads
--block-size N # Block size
# Benchmark settings
--device DEVICE # Device (default: cuda:0)
--repeats N # Repetitions
--warmup-iters N # Warmup iterations
--profile-memory # Profile memory usage
# Parameter sweeps
--sweep-param PARAM # Parameter name to sweep (e.g., num_kv_splits,
# reorder_batch_threshold)
--sweep-values N [N ...] # Values to sweep for the parameter
# Output
--output-csv FILE # Save to CSV
--output-json FILE # Save to JSON
```
## Hardware Requirements
| Backend | Hardware |
|---------|----------|
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |
| FlashMLA | Hopper (SM90+) |
| FlashInfer-MLA | Any CUDA GPU |
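If you are unsure what your GPU supports, one quick way to narrow down candidate MLA backends is to read the CUDA compute capability. This snippet is a rough sketch using plain PyTorch and is not part of the suite itself; it just applies the table above:
```python
import torch

# Map the current GPU's compute capability to the MLA backends listed above.
major, minor = torch.cuda.get_device_capability()
sm = major * 10 + minor

candidates = ["flashinfer_mla"]                    # any CUDA GPU
if sm >= 90:
    candidates += ["flashattn_mla", "flashmla"]    # Hopper (SM90+)
if sm >= 100:
    candidates.append("cutlass_mla")               # Blackwell (SM100+)
print(f"SM{sm}: candidate MLA backends -> {candidates}")
```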
## Using MLA Runner Directly
All MLA backends are available through `mla_runner.run_mla_benchmark()`:
```python
from mla_runner import run_mla_benchmark
from common import BenchmarkConfig
config = BenchmarkConfig(
backend="cutlass_mla",
batch_spec="64q1s4k",
num_layers=10,
head_dim=576,
num_q_heads=128,
num_kv_heads=1,
block_size=128,
device="cuda:0",
repeats=5,
warmup_iters=3,
)
# CUTLASS MLA with specific num_kv_splits
result = run_mla_benchmark("cutlass_mla", config, num_kv_splits=4)
print(f"Time: {result.mean_time:.6f}s")
# FlashInfer-MLA
result = run_mla_benchmark("flashinfer_mla", config)
# FlashAttn MLA (Hopper SM90+)
result = run_mla_benchmark("flashattn_mla", config, reorder_batch_threshold=64)
# FlashMLA (Hopper SM90+)
result = run_mla_benchmark("flashmla", config, reorder_batch_threshold=64)
```
## Python API
```python
from batch_spec import parse_batch_spec, format_batch_spec, get_batch_stats
from common import BenchmarkConfig, BenchmarkResult, ResultsFormatter
# Parse batch specs
requests = parse_batch_spec("2q2k_q4s1k_32q1s1k")
print(format_batch_spec(requests))
# "2 prefill (2x2k), 1 extend (1xq4kv1k), 32 decode (32x1k)"
# Get batch statistics
stats = get_batch_stats(requests)
print(f"Total tokens: {stats['total_tokens']}")
print(f"Num decode: {stats['num_decode']}, Num prefill: {stats['num_prefill']}")
# Format results
formatter = ResultsFormatter()
formatter.save_csv(results, "output.csv")
formatter.save_json(results, "output.json")
```
## Tips
**1. Warmup matters** - Use `--warmup-iters 10` for stable results
**2. Multiple repeats** - Use `--repeats 20` for low variance
**3. Save results** - Always use `--output-csv` or `--output-json`
**4. Test incrementally** - Start with `--num-layers 1 --repeats 1`
**5. Extended grammar** - Leverage spec decode, chunked prefill patterns
**6. Parameter sweeps** - Use `--sweep-param` and `--sweep-values` to find optimal values


@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
MockModelConfig,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
"MockModelConfig",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]

View File

@@ -1,231 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}

View File

@@ -1,886 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys()):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values
# (YAML takes precedence unless CLI arg was explicitly set)
# Backend(s)
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings
if "benchmark" in yaml_config:
bench = yaml_config["benchmark"]
args.device = bench.get("device", args.device)
args.repeats = bench.get("repeats", args.repeats)
args.warmup_iters = bench.get("warmup_iters", args.warmup_iters)
args.profile_memory = bench.get("profile_memory", args.profile_memory)
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", [1])
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()

View File

@@ -1,503 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import numpy as np
import torch
from rich.console import Console
from rich.table import Table
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
_HAS_ATTENTION_LAYER_BASE = True
except ImportError:
_HAS_ATTENTION_LAYER_BASE = False
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
class MockModelConfig:
"""Mock model configuration."""
def __init__(
self,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype = torch.float16,
max_model_len: int = 32768,
):
self._n_q = num_q_heads
self._n_kv = num_kv_heads
self._d = head_dim
self.dtype = dtype
self.max_model_len = max_model_len
def get_num_attention_heads(self, _=None) -> int:
return self._n_q
def get_num_kv_heads(self, _=None) -> int:
return self._n_kv
def get_head_size(self) -> int:
return self._d
def get_num_layers(self) -> int:
"""Mock method for layer count queries."""
return 1
def get_sliding_window_for_layer(self, _layer_idx: int):
"""Mock method for sliding window queries."""
return None
def get_logits_soft_cap_for_layer(self, _layer_idx: int):
"""Mock method for logits soft cap queries."""
return None
def get_sm_scale_for_layer(self, _layer_idx: int) -> float:
"""Mock method for SM scale queries."""
return 1.0 / (self.get_head_size() ** 0.5)
class MockParallelConfig:
"""Mock parallel configuration."""
pass
class MockCompilationConfig:
"""Mock compilation configuration."""
def __init__(self):
self.full_cuda_graph = False
self.static_forward_context = {}
class MockVLLMConfig:
"""Mock VLLM configuration."""
def __init__(self):
self.compilation_config = MockCompilationConfig()
class MockRunner:
"""Mock GPU runner for metadata builders."""
def __init__(
self,
seq_lens: np.ndarray,
query_start_locs: np.ndarray,
device: torch.device,
num_q_heads: int,
num_kv_heads: int,
head_dim: int,
dtype: torch.dtype,
):
self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype)
self.parallel_config = MockParallelConfig()
self.vllm_config = MockVLLMConfig()
self.seq_lens_np = seq_lens
self.query_start_loc_np = query_start_locs
self.device = device
self.attention_chunk_size = None
self.num_query_heads = num_q_heads
self.num_kv_heads = num_kv_heads
self.dtype = dtype
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group by batch spec
by_spec = {}
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
by_spec[spec][r.config.backend] = r
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in sorted(by_spec.keys()):
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
row = [spec]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the backend's is_mla() property.
Args:
backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_class = AttentionBackendEnum[backend.upper()].get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError):
return False

View File

@@ -1,61 +0,0 @@
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Backend-specific tuning
cutlass_mla:
num_kv_splits: auto # or specific value like 4, 8, 16
flashattn_mla:
reorder_batch_threshold: 512
flashmla:
reorder_batch_threshold: 1

View File

@@ -1,60 +0,0 @@
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- cutlass_mla
- flashinfer_mla
- flashattn_mla # Hopper only
- flashmla # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]

View File

@@ -1,88 +0,0 @@
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find the crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: flashattn_mla
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#

View File

@@ -1,62 +0,0 @@
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- flashattn_mla # reorder_batch_threshold = 512
- flashmla # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode, but via a different mechanism
# - flashinfer_mla
# Benchmark settings
benchmark:
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"

View File

@@ -1,40 +0,0 @@
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Context extension
- "q1ks2k" # 1k query, 2k sequence (chunked prefill)
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
backends:
- flash
- triton
- flashinfer
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false

View File

@@ -1,836 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import importlib
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
These methods are needed by metadata builders but aren't normally
present on ModelConfig when used in benchmark contexts.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
quantization_param_path=None,
enforce_eager=False,
max_context_len_to_capture=None,
max_seq_len_to_capture=8192,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
use_async_output_proc=True,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend name to class name prefix mapping
_BACKEND_NAME_MAP = {
"flashattn_mla": "FlashAttnMLA",
"flashmla": "FlashMLA",
"flashinfer_mla": "FlashInferMLA",
"cutlass_mla": "CutlassMLA",
}
# Special properties that differ from defaults
_BACKEND_PROPERTIES = {
"flashmla": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
"block_size": 64, # FlashMLA uses fixed block size
},
"flashinfer_mla": {
"block_size": 64, # FlashInfer MLA only supports 32 or 64
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration using naming conventions.
All MLA backends follow the pattern:
- Module: vllm.v1.attention.backends.mla.{backend}
- Impl: {Name}Impl
- Metadata: {Name}Metadata (or MLACommonMetadata)
- DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata)
- MetadataBuilder: {Name}MetadataBuilder
"""
if backend not in _BACKEND_NAME_MAP:
raise ValueError(f"Unknown backend: {backend}")
name = _BACKEND_NAME_MAP[backend]
props = _BACKEND_PROPERTIES.get(backend, {})
# Check if backend uses common metadata (FlashInfer, CUTLASS)
uses_common = backend in ("flashinfer_mla", "cutlass_mla")
return {
"module": f"vllm.v1.attention.backends.mla.{backend}",
"impl_class": f"{name}Impl",
"metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata",
"decode_metadata_class": "MLACommonDecodeMetadata"
if uses_common
else f"{name}DecodeMetadata",
"builder_class": f"{name}MetadataBuilder",
"query_format": props.get("query_format", "tuple"),
"block_size": props.get("block_size", None),
}
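# Illustrative sketch (an assumption for documentation purposes, not used by the
# benchmark itself): per the naming convention above, _get_backend_config("flashmla")
# resolves to the FlashMLA classes plus the overrides from _BACKEND_PROPERTIES.
def _example_backend_config() -> dict:
    cfg = _get_backend_config("flashmla")
    # Expected keys/values (derived from the maps above, not re-verified against
    # the installed vLLM build):
    #   module:         "vllm.v1.attention.backends.mla.flashmla"
    #   impl_class:     "FlashMLAImpl"
    #   metadata_class: "FlashMLAMetadata"
    #   builder_class:  "FlashMLAMetadataBuilder"
    #   query_format:   "concat"  (single concatenated query tensor)
    #   block_size:     64        (FlashMLA's fixed block size)
    return cfg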
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
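# Worked example of the slot-mapping arithmetic above (a sketch; the numbers are
# hypothetical): with block_size=16, a request with q_len=4 and kv_len=20 has
# context_len=16, so its new tokens land at kv indices 16..19, i.e. block_idx=1,
# offsets 0..3. If that request's block-table row maps block 1 to global block 7,
# the slots are 7*16 + 0 .. 7*16 + 3 = 112..115.
def _example_slot_ids(
    context_len: int, q_len: int, block_size: int, block_row: list[int]
) -> list[int]:
    slots = []
    for j in range(q_len):
        token_kv_idx = context_len + j
        slots.append(
            block_row[token_kv_idx // block_size] * block_size
            + token_kv_idx % block_size
        )
    return slots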
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale', and 'output' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
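# Shape sketch (assumption: DeepSeek-V3-style dims with 128 query heads, matching
# the fallback in _extract_mla_dims_from_config): calling the helper above on CPU
# just to illustrate the decode vs prefill query layouts.
def _example_input_shapes(total_q: int = 8) -> dict:
    dims = {
        "kv_lora_rank": 512, "qk_nope_head_dim": 128, "qk_rope_head_dim": 64,
        "v_head_dim": 128, "num_q_heads": 128, "num_kv_heads": 1, "head_dim": 576,
    }
    decode_inputs, prefill_inputs = _create_input_tensors(
        total_q, dims, "tuple", torch.device("cpu"), torch.float32
    )
    return {
        "decode_q_nope": tuple(decode_inputs[0].shape),  # (total_q, 128, 512)
        "decode_q_pe": tuple(decode_inputs[1].shape),    # (total_q, 128, 64)
        "prefill_q": tuple(prefill_inputs["q"].shape),   # (total_q, 128, 192)
        "k_pe": tuple(prefill_inputs["k_pe"].shape),     # (total_q, 1, 64)
    }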
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
Returns:
Tuple of (impl, layer, builder_instance)
"""
# Import backend classes
backend_module = importlib.import_module(backend_cfg["module"])
impl_class = getattr(backend_module, backend_cfg["impl_class"])
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create impl
impl = impl_class(
num_heads=mla_dims["num_q_heads"],
head_size=mla_dims["head_dim"],
scale=scale,
num_kv_heads=mla_dims["num_kv_heads"],
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
logits_soft_cap=None,
attn_type="decoder",
kv_sharing_target_layer_name=None,
q_lora_rank=None,
kv_lora_rank=mla_dims["kv_lora_rank"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
qk_rope_head_dim=mla_dims["qk_rope_head_dim"],
qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
kv_b_proj=mock_kv_b_proj,
)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1, # MLA uses 1 KV head
head_size=mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],  # 576 for DeepSeek MLA
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if backend_cfg["builder_class"]:
builder_class = getattr(backend_module, backend_cfg["builder_class"])
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
total_q = sum(q_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Determine which forward method to use based on metadata
if metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
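# Timing note (worked example with made-up numbers): if one repeat of the inner
# loop over config.num_layers=4 layers records elapsed_ms=20.0, the stored sample
# is 20.0 / 1000 / 4 = 0.005 s per layer, and with total_q=256 tokens the reported
# throughput is 256 / 0.005 = 51200 tokens/s for that mean time.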
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, and builder (reused across benchmarks)
impl, layer, builder_instance = _create_backend_impl(
backend_cfg, mla_dims, vllm_config, device
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
original_threshold = None
if threshold is not None and builder_instance:
original_threshold = builder_instance.reorder_batch_threshold
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
original_num_splits = None
if num_splits is not None and hasattr(impl, "_num_kv_splits"):
original_num_splits = impl._num_kv_splits
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
)
results.append(result)
finally:
# Restore original threshold
if original_threshold is not None:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits
if original_num_splits is not None:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla or flashinfer_mla
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params)
# Return single result or list based on input
return results[0] if return_single else results
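# Usage sketch (assumptions: BenchmarkConfig lives in common.py as in the other
# runners, a CUDA device is available, and the batch_spec string below is
# hypothetical -- see batch_spec.parse_batch_spec for the real grammar). This
# helper is only illustrative and is not executed on import.
def _example_run_mla_benchmark():
    from common import BenchmarkConfig  # assumed import, mirroring common.py usage
    cfg = BenchmarkConfig(
        backend="flashattn_mla",
        batch_spec="8q1s1024",  # hypothetical spec: 8 decode requests, 1024-token KV
        block_size=16,
        num_q_heads=128,
        num_kv_heads=1,
        head_dim=576,
        num_layers=4,
        warmup_iters=3,
        repeats=10,
        device="cuda:0",
    )
    # Single-config mode returns one BenchmarkResult.
    single = run_mla_benchmark("flashattn_mla", cfg, reorder_batch_threshold=1)
    # Batched mode reuses the backend setup across (config, threshold) pairs.
    batched = run_mla_benchmark("flashattn_mla", [(cfg, 1), (cfg, 4)])
    return single, batched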

View File

@@ -1,481 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import types
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
_BACKEND_CONFIG = {
"flash": {
"module": "vllm.v1.attention.backends.flash_attn",
"backend_class": "FlashAttentionBackend",
"dtype": torch.float16,
"cache_layout": "standard",
# ^ [2, num_blocks, block_size, num_kv_heads, head_dim]
},
"triton": {
"module": "vllm.v1.attention.backends.triton_attn",
"backend_class": "TritonAttentionBackend",
"dtype": torch.float32,
"cache_layout": "standard",
},
"flashinfer": {
"module": "vllm.v1.attention.backends.flashinfer",
"backend_class": "FlashInferBackend",
"dtype": torch.float16,
"cache_layout": "flashinfer",
# ^ [num_blocks, 2, block_size, num_kv_heads, head_dim]
},
}
def _get_backend_config(backend: str) -> dict:
if backend not in _BACKEND_CONFIG:
raise ValueError(
f"Unknown backend: {backend}. "
f"Available: {', '.join(_BACKEND_CONFIG.keys())}"
)
return _BACKEND_CONFIG[backend]
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
seq_lens_cpu = seq_lens.cpu()
max_seq_len = int(seq_lens_cpu.max())
context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
def _create_vllm_config(
config: BenchmarkConfig,
dtype: torch.dtype,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype=dtype,
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
):
"""Create backend implementation instance."""
import importlib
backend_module = importlib.import_module(backend_cfg["module"])
backend_class = getattr(backend_module, backend_cfg["backend_class"])
scale = get_attention_scale(config.head_dim)
dtype = backend_cfg["dtype"]
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer, dtype
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
):
"""Create metadata builder instance."""
return backend_class.get_builder_cls()(
kv_cache_spec=kv_cache_spec,
layer_names=["layer_0"],
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
cache_layout: str,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers."""
if cache_layout == "flashinfer":
# FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
max_num_blocks,
2,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
else:
# Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim]
cache_list = [
torch.zeros(
2,
max_num_blocks,
config.block_size,
config.num_kv_heads,
config.head_dim,
device=device,
dtype=dtype,
)
for _ in range(config.num_layers)
]
return cache_list
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: flash, triton, flashinfer
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "flashinfer":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
max_num_blocks = (max_kv + config.block_size - 1) // config.block_size
backend_class, impl, layer, dtype = _create_backend_impl(
backend_cfg, config, device
)
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
vllm_config = _create_vllm_config(config, dtype, max_num_blocks)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_cfg["cache_layout"], device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)

View File

@@ -20,12 +20,8 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 PROVIDER_CFGS = {
-    "vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
-    "vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
-    "flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
-    "flashinfer-swizzle": dict(
-        backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
-    ),
+    "vllm": dict(backend="vllm", enabled=True),
+    "flashinfer": dict(backend="flashinfer", enabled=True),
 }
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
@@ -40,7 +36,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
 @triton.testing.perf_report(
     triton.testing.Benchmark(
         x_names=["batch_size"],
-        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
         x_log=False,
         line_arg="provider",
         line_vals=_enabled,
@@ -67,36 +63,19 @@ def benchmark(batch_size, provider, N, K):
     if cfg["backend"] == "vllm":
         # vLLM's FP4 quantization
-        if cfg["is_sf_swizzled_layout"]:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: ops.scaled_fp4_quant(
-                    a, a_global_scale, is_sf_swizzled_layout=True
-                ),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: ops.scaled_fp4_quant(
-                    a, a_global_scale, is_sf_swizzled_layout=False
-                ),
-                quantiles=quantiles,
-            )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: ops.scaled_fp4_quant(a, a_global_scale),
+            quantiles=quantiles,
+        )
     elif cfg["backend"] == "flashinfer":
         # FlashInfer's FP4 quantization
-        if cfg["is_sf_swizzled_layout"]:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: flashinfer_fp4_quantize(
-                    a, a_global_scale, is_sf_swizzled_layout=True
-                ),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-                lambda: flashinfer_fp4_quantize(
-                    a, a_global_scale, is_sf_swizzled_layout=False
-                ),
-                quantiles=quantiles,
-            )
+        # Use is_sf_swizzled_layout=True to match vLLM's output format
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: flashinfer_fp4_quantize(
+                a, a_global_scale, is_sf_swizzled_layout=True
+            ),
+            quantiles=quantiles,
+        )
     # Convert ms to us for better readability at small batch sizes
     to_us = lambda t_ms: t_ms * 1000
@@ -113,9 +92,7 @@ def prepare_shapes(args):
     return out
-def _test_accuracy_once(
-    M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
-):
+def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
     """Test accuracy between vLLM and FlashInfer FP4 quantization."""
     # Create input tensor
     a = torch.randn((M, K), device=device, dtype=dtype)
@@ -124,13 +101,11 @@ def _test_accuracy_once(
     a_global_scale = compute_global_scale(a)
     # vLLM quantization
-    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
-        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
-    )
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
     # FlashInfer quantization (with swizzled layout to match vLLM's output)
     flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
-        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
+        a, a_global_scale, is_sf_swizzled_layout=True
     )
     flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
@@ -139,14 +114,7 @@ def _test_accuracy_once(
         vllm_fp4,
         flashinfer_fp4,
     )
-    # Compare scales
-    torch.testing.assert_close(
-        vllm_scale,
-        flashinfer_scale,
-    )
-    print(
-        f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED"  # noqa: E501
-    )
+    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
 def test_accuracy():
@@ -162,10 +130,9 @@ def test_accuracy():
     Ms = [1, 1024]
     Ks = [4096]
-    for is_sf_swizzled_layout in [True, False]:
-        for M in Ms:
-            for K in Ks:
-                _test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
+    for M in Ms:
+        for K in Ks:
+            _test_accuracy_once(M, K, dtype, device)
     print("\nAll accuracy tests passed!")
@@ -178,7 +145,7 @@ if __name__ == "__main__":
         "--models",
         nargs="+",
         type=str,
-        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
         choices=list(WEIGHT_SHAPES.keys()),
     )
     parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])

View File

@@ -7,7 +7,7 @@ import itertools
 import torch
 import vllm.model_executor.layers.activation # noqa F401
-from vllm.model_executor.custom_op import op_registry
+from vllm.model_executor.custom_op import CustomOp
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@@ -33,14 +33,14 @@ def benchmark_activation(
     torch.set_default_device(device)
     if func_name == "gelu_and_mul":
-        layer = op_registry[func_name](approximate="none")
+        layer = CustomOp.op_registry[func_name](approximate="none")
     elif func_name == "gelu_and_mul_tanh":
-        layer = op_registry["gelu_and_mul"](approximate="tanh")
+        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
     elif func_name == "fatrelu_and_mul":
         threshold = 0.5
-        layer = op_registry[func_name](threshold)
+        layer = CustomOp.op_registry[func_name](threshold)
     else:
-        layer = op_registry[func_name]()
+        layer = CustomOp.op_registry[func_name]()
     x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
     compiled_layer = torch.compile(layer.forward_native)

View File

@@ -0,0 +1,244 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from packaging import version
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION,
)
try:
import bitblas
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
raise ImportError(
"bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
)
except ImportError as e:
bitblas_import_exception = e
raise ValueError(
"Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."
)
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument(
"--group_size", type=int, default=None, help="Group size for grouped quantization."
)
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument(
"--with_bias", action="store_true", help="Include bias in the benchmark."
)
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument(
"--with_zeros", action="store_true", help="Include zeros in the quantization."
)
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1], len(input_args_str) + 2, len(headers[1]) + 2)
col_widths[2] = max(
col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2,
)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]
)
print(row_str)
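# Example invocation (illustrative only; the script filename is hypothetical, the
# flags are the ones defined by the parser above):
#   python benchmark_bitblas.py --A_dtype float16 --W_dtype int4 \
#       --accum_dtype float16 --group_size 128 --with_scaling --with_zeros \
#       --zeros_mode original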

View File

@@ -9,7 +9,6 @@ but use different quantization strategies and backends.
 import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
@@ -139,13 +138,12 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=num_experts,
-                hidden_dim=k,
-                intermediate_size_per_partition=n,
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            e=num_experts,
+            n=n,
+            k=k,
             quant_config=quant_config,
-            device=w1.device,
         ),
     )

View File

@@ -12,7 +12,6 @@ import torch
 import torch.utils.benchmark as benchmark
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
@@ -197,9 +196,10 @@ def bench_run(
     )
     kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+        MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            out_dtype=dtype,
+            max_experts_per_worker=e,
             quant_config=quant_config,
         ),
     )
@@ -242,9 +242,10 @@ def bench_run(
     )
     kernel = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(),
+        MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            make_dummy_moe_config(),
+            out_dtype=dtype,
+            max_experts_per_worker=e,
             quant_config=quant_config,
         ),
    )

View File

@@ -1,99 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
num_tokens_range = [2**i for i in range(0, 8, 2)]
num_experts_range = [16, 32, 64, 128, 256, 512]
topk_range = [3, 4]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
def torch_topk(
gating_output: torch.Tensor,
topk: int,
renormalize: bool,
scoring_func: str = "softmax",
):
if scoring_func == "softmax":
scores = torch.softmax(gating_output.float(), dim=-1)
else:
scores = torch.sigmoid(gating_output.float())
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
if renormalize:
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
return topk_weights, topk_ids
def get_benchmark(scoring_func):
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["torch", "vllm"],
line_names=["Torch", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name=f"fused-topk-perf-{scoring_func}",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
dtype = torch.bfloat16
hidden_size = 1024
renormalize = True
hidden_states = torch.randn(
(num_tokens, hidden_size), dtype=dtype, device="cuda"
)
gating_output = torch.randn(
(num_tokens, num_experts), dtype=dtype, device="cuda"
)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: torch_topk(
gating_output=gating_output,
topk=topk,
renormalize=renormalize,
scoring_func=scoring_func,
),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: fused_topk(
hidden_states=hidden_states,
gating_output=gating_output,
topk=topk,
renormalize=renormalize,
scoring_func=scoring_func,
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
parser.add_argument("--scoring-func", type=str, default="softmax")
parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
args = parser.parse_args()
# Get the benchmark function
benchmark = get_benchmark(args.scoring_func)
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@@ -6,7 +6,6 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
@@ -135,13 +134,13 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+            e=w2.shape[0],
+            n=w2.shape[2],
+            k=w2.shape[1],
             quant_config=quant_config,
-            device=w1.device,
         ),
     )
@@ -167,13 +166,13 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            moe_config=make_dummy_moe_config(
-                num_experts=w2.shape[0],
-                hidden_dim=w2.shape[1],
-                intermediate_size_per_partition=w2.shape[2],
-                in_dtype=a.dtype,
-            ),
+            out_dtype=a.dtype,
+            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+            e=w2.shape[0],
+            n=w2.shape[2],
+            k=w2.shape[1],
             quant_config=quant_config,
-            device=w1.device,
         ),
     )

View File

@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
     assert bt.w_tok_s is None
     assert bt.group_size is not None
-    fn = lambda: ops.marlin_gemm(
+    fn = lambda: ops.gptq_marlin_gemm(
         a=bt.a,
         c=None,
         b_q_weight=w_q,

View File

@@ -6,6 +6,12 @@ import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL,
GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
)
from vllm.model_executor.layers.quantization.utils.allspark_utils import ( from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
ALLSPARK_SUPPORTED_QUANT_TYPES, ALLSPARK_SUPPORTED_QUANT_TYPES,
@@ -28,6 +34,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize, awq_marlin_quantize,
marlin_quantize, marlin_quantize,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_pack,
gptq_quantize_weights, gptq_quantize_weights,
@@ -69,7 +78,14 @@ def bench_run(
if size_k % group_size != 0: if size_k % group_size != 0:
return return
repack_supported = group_size in MARLIN_SUPPORTED_GROUP_SIZES marlin_24_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
allspark_supported = ( allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1 and group_size == -1
@@ -110,6 +126,14 @@ def bench_run(
marlin_sort_indices, marlin_sort_indices,
) )
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params(): def gen_repack_params():
q_w_gptq = None q_w_gptq = None
repack_sort_indices = None repack_sort_indices = None
@@ -164,6 +188,9 @@ def bench_run(
marlin_g_idx, marlin_g_idx,
marlin_sort_indices, marlin_sort_indices,
) = gen_marlin_params() ) = gen_marlin_params()
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
gen_marlin_24_params()
)
q_w_gptq, repack_sort_indices = gen_repack_params() q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
gen_allspark_params() gen_allspark_params()
@@ -173,6 +200,9 @@ def bench_run(
marlin_workspace = MarlinWorkspace( marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
) )
marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
)
globals = { globals = {
# Gen params # Gen params
@@ -192,6 +222,12 @@ def bench_run(
"marlin_sort_indices": marlin_sort_indices, "marlin_sort_indices": marlin_sort_indices,
"marlin_workspace": marlin_workspace, "marlin_workspace": marlin_workspace,
"is_k_full": is_k_full, "is_k_full": is_k_full,
# Marlin_24 params
"marlin_24_w_ref": marlin_24_w_ref,
"marlin_24_q_w_comp": marlin_24_q_w_comp,
"marlin_24_meta": marlin_24_meta,
"marlin_24_s": marlin_24_s,
"marlin_24_workspace": marlin_24_workspace,
# GPTQ params # GPTQ params
"q_w_gptq": q_w_gptq, "q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices, "repack_sort_indices": repack_sort_indices,
@@ -203,7 +239,8 @@ def bench_run(
"sm_version": sm_version, "sm_version": sm_version,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
# Kernels # Kernels
"marlin_gemm": ops.marlin_gemm, "gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack, "gptq_marlin_repack": ops.gptq_marlin_repack,
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm, "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
} }
@@ -226,24 +263,35 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="marlin_gemm", description="gptq_marlin_gemm",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="marlin_gemm_fp32", description="gptq_marlin_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
if marlin_24_supported:
results.append(
benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_24_gemm",
).blocked_autorange(min_run_time=min_run_time)
)
if repack_supported: if repack_supported:
results.append( results.append(
benchmark.Timer( benchmark.Timer(

View File

@@ -15,18 +15,11 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
-from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig,
-    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
-    RoutingMethodType,
     _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-    TritonOrDeepGemmExperts,
-)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
@@ -201,36 +194,10 @@ def benchmark_config(
             block_shape=block_quant_shape,
         )
-        deep_gemm_experts = None
-        if use_deep_gemm:
-            deep_gemm_experts = mk.FusedMoEModularKernel(
-                prepare_finalize=MoEPrepareAndFinalizeNoEP(),
-                fused_experts=TritonOrDeepGemmExperts(
-                    moe_config=FusedMoEConfig(
-                        num_experts=num_experts,
-                        experts_per_token=topk,
-                        hidden_dim=hidden_size,
-                        intermediate_size_per_partition=shard_intermediate_size,
-                        num_local_experts=num_experts,
-                        activation="silu",
-                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-                        in_dtype=init_dtype,
-                        routing_method=RoutingMethodType.TopK,
-                        device="cuda",
-                    ),
-                    quant_config=quant_config,
-                ),
-            )
         with override_config(config):
             topk_weights, topk_ids, token_expert_indices = fused_topk(
                 x, input_gating, topk, renormalize=not use_deep_gemm
             )
-            if use_deep_gemm:
-                return deep_gemm_experts(
-                    x, w1, w2, topk_weights, topk_ids, inplace=True
-                )
             return fused_experts(
                 x,
                 w1,
@@ -239,6 +206,7 @@ def benchmark_config(
                 topk_ids,
                 inplace=True,
                 quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
             )
     # JIT compilation & warmup
@@ -675,7 +643,6 @@ def main(args: argparse.Namespace):
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
-        "Glm4MoeLiteForCausalLM",
         "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts

View File

@@ -8,8 +8,10 @@ import ray
import torch import torch
from transformers import AutoConfig from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
moe_permute, moe_permute,
moe_unpermute, moe_unpermute,
) )
@@ -39,6 +41,7 @@ def benchmark_permute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -61,14 +64,31 @@ def benchmark_permute(
input_gating.copy_(gating_output[i]) input_gating.copy_(gating_output[i])
def run(): def run():
moe_permute( if use_customized_permute:
qhidden_states, (
a1q_scale=None, permuted_hidden_states,
topk_ids=topk_ids, a1q_scale,
n_expert=num_experts, first_token_off,
expert_map=None, inv_perm_idx,
align_block_size=align_block_size, m_indices,
) ) = moe_permute(
qhidden_states,
a1q_scale=None,
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(
qhidden_states, None, topk_ids, num_experts, None, align_block_size
)
# JIT compilation & warmup # JIT compilation & warmup
run() run()
@@ -113,9 +133,11 @@ def benchmark_unpermute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8: if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None) qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
@@ -130,37 +152,78 @@ def benchmark_unpermute(
) )
def prepare(): def prepare():
( if use_customized_permute:
permuted_hidden_states, (
_, permuted_hidden_states,
first_token_off, a1q_scale,
inv_perm_idx, first_token_off,
_, inv_perm_idx,
) = moe_permute( m_indices,
qhidden_states, ) = moe_permute(
a1q_scale=None, qhidden_states,
topk_ids=topk_ids, a1q_scale=None,
n_expert=num_experts, topk_ids=topk_ids,
expert_map=None, n_expert=num_experts,
align_block_size=align_block_size, expert_map=None,
) align_block_size=align_block_size,
# convert to fp16/bf16 as gemm output )
return ( # convert to fp16/bf16 as gemm output
permuted_hidden_states.to(dtype), return (
first_token_off, permuted_hidden_states.to(dtype),
inv_perm_idx, first_token_off,
) inv_perm_idx,
m_indices,
)
else:
(
permuted_qhidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = _moe_permute(
qhidden_states, None, topk_ids, num_experts, None, align_block_size
)
# convert to fp16/bf16 as gemm output
return (
permuted_qhidden_states.to(dtype),
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
)
def run(input: tuple): def run(input: tuple):
(permuted_hidden_states, first_token_off, inv_perm_idx) = input if use_customized_permute:
output = torch.empty_like(hidden_states) (
moe_unpermute( permuted_hidden_states,
output, first_token_off,
permuted_hidden_states, inv_perm_idx,
topk_weights, m_indices,
inv_perm_idx, ) = input
first_token_off, output = torch.empty_like(hidden_states)
) moe_unpermute(
output,
permuted_hidden_states,
topk_weights,
inv_perm_idx,
first_token_off,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = input
_moe_unpermute_and_reduce(
output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
)
# JIT compilation & warmup # JIT compilation & warmup
input = prepare() input = prepare()
@@ -215,7 +278,8 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
) -> tuple[float, float]: use_customized_permute: bool = False,
) -> tuple[dict[str, int], float]:
set_random_seed(self.seed) set_random_seed(self.seed)
permute_time = benchmark_permute( permute_time = benchmark_permute(
@@ -227,6 +291,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
unpermute_time = benchmark_unpermute( unpermute_time = benchmark_unpermute(
num_tokens, num_tokens,
@@ -237,6 +302,7 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
return permute_time, unpermute_time return permute_time, unpermute_time
@@ -264,7 +330,6 @@ def main(args: argparse.Namespace):
config.architectures[0] == "DeepseekV3ForCausalLM" config.architectures[0] == "DeepseekV3ForCausalLM"
or config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV2ForCausalLM"
or config.architectures[0] == "Glm4MoeForCausalLM" or config.architectures[0] == "Glm4MoeForCausalLM"
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
): ):
E = config.n_routed_experts E = config.n_routed_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
@@ -283,6 +348,7 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
if args.batch_size is None: if args.batch_size is None:
batch_sizes = [ batch_sizes = [
@@ -334,6 +400,7 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_customized_permute,
) )
for batch_size in batch_sizes for batch_size in batch_sizes
], ],
@@ -353,6 +420,7 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
) )
parser.add_argument("--use-customized-permute", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--trust-remote-code", action="store_true") parser.add_argument("--trust-remote-code", action="store_true")

View File

@@ -22,8 +22,8 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True) mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda() or current_platform.is_rocm(), ( assert current_platform.is_cuda(), (
"Only support tune w8a8 block fp8 kernel on CUDA/ROCm device." "Only support tune w8a8 block fp8 kernel on CUDA device."
) )
DTYPE_MAP = { DTYPE_MAP = {

View File

@@ -14,6 +14,7 @@ from vllm.triton_utils import triton
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import (
calc_diff, calc_diff,
fp8_gemm_nt, fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8, per_block_cast_to_fp8,
) )
@@ -47,9 +48,8 @@ def benchmark_shape(
block_size = [128, 128] block_size = [128, 128]
# Pre-quantize A for all implementations # Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8( A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A, block_size[1], column_major_scales=True, tma_aligned_scales=True A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
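A reference of what per_token_group_quant_fp8 computes, under the usual convention that every contiguous group of 128 elements along the last dimension gets its own scale amax / 448 (448 being the largest normal value of float8_e4m3fn); the column-major / TMA-aligned scale layout the hunk above deals with is omitted here:

import torch

def per_token_group_quant_fp8_ref(A: torch.Tensor, group_size: int = 128):
    T, K = A.shape
    g = A.float().reshape(T, K // group_size, group_size)
    scales = g.abs().amax(dim=-1, keepdim=True).clamp_(min=1e-12) / 448.0
    q = (g / scales).clamp_(-448.0, 448.0).to(torch.float8_e4m3fn)
    return q.reshape(T, K), scales.squeeze(-1)   # fp8 data, per-group fp32 scales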

View File

@@ -13,8 +13,6 @@ endif()
# #
# Define environment variables for special configurations # Define environment variables for special configurations
# #
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
@@ -105,16 +103,6 @@ else()
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND) find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_AVX2)
set(AVX2_FOUND ON)
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
endif()
if (ENABLE_AVX512)
set(AVX512_FOUND ON)
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
endif()
endif() endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -391,12 +379,6 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
endif() endif()
endif() endif()
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()
if(USE_ONEDNN) if(USE_ONEDNN)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/dnnl_kernels.cpp" "csrc/cpu/dnnl_kernels.cpp"

View File

@@ -19,7 +19,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
flashmla flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208 GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
@@ -30,24 +30,6 @@ endif()
FetchContent_MakeAvailable(flashmla) FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# Vendor FlashMLA interface into vLLM with torch-ops shim.
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
FLASHMLA_INTERFACE_CONTENT)
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
FLASHMLA_INTERFACE_CONTENT
"${FLASHMLA_INTERFACE_CONTENT}")
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
"${FLASHMLA_INTERFACE_CONTENT}")
# Install the generated flash_mla_interface.py to the wheel
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
DESTINATION vllm/third_party/flashmla/
COMPONENT _flashmla_C)
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with # Only build FlashMLA kernels if we are building for something compatible with
# sm90a # sm90a
@@ -73,42 +55,16 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
# Misc kernels for decoding ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
# sm90 dense decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
# sm90 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
# sm90 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
# sm100 dense prefill & backward
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
# sm100 sparse prefill ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
# sm100 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
) )
set(FlashMLA_Extension_SOURCES set(FlashMLA_Extension_SOURCES
@@ -120,7 +76,6 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_INCLUDES set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc ${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/kerutils/include
${flashmla_SOURCE_DIR}/csrc/sm90 ${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include ${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -128,6 +83,7 @@ if(FLASH_MLA_ARCHS)
set(FlashMLA_Extension_INCLUDES set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc ${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include ${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -154,12 +110,9 @@ if(FLASH_MLA_ARCHS)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files. # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
target_compile_options(_flashmla_C PRIVATE target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API> $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API> $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
define_extension_target( define_extension_target(
_flashmla_extension_C _flashmla_extension_C

View File

@@ -7,7 +7,6 @@
#include <vector> #include <vector>
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping); const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,

View File

@@ -24,14 +24,7 @@
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
#endif #endif
#if defined(__gfx942__)
constexpr float kFp8ScaleDivisor = 224.f;
#else
constexpr float kFp8ScaleDivisor = 448.f;
#endif
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes,
const torch::Tensor& block_mapping) { const torch::Tensor& block_mapping) {
torch::Device src_device = src.device(); torch::Device src_device = src.device();
torch::Device dst_device = dst.device(); torch::Device dst_device = dst.device();
@@ -56,6 +49,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
char* src_ptr = static_cast<char*>(src.data_ptr()); char* src_ptr = static_cast<char*>(src.data_ptr());
char* dst_ptr = static_cast<char*>(dst.data_ptr()); char* dst_ptr = static_cast<char*>(dst.data_ptr());
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
const at::cuda::OptionalCUDAGuard device_guard( const at::cuda::OptionalCUDAGuard device_guard(
src_device.is_cuda() ? src_device : dst_device); src_device.is_cuda() ? src_device : dst_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
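The new stride-based size matters when the allocator pads each block for alignment: stride(0) counts those padding elements, while a numel-based computation would not, and the comment above assumes the per-block data (padding included) is contiguous. The same computation in PyTorch terms, with hypothetical shapes:

import torch

kv_cache = torch.empty(8, 16, 8, 64, dtype=torch.float16)   # [num_blocks, block_size, heads, head_dim]
block_size_in_bytes = kv_cache.element_size() * kv_cache.stride(0)
# = 2 * 16 * 8 * 64 = 16384 bytes here; with a padded layout, stride(0) exceeds
# block_size * heads * head_dim and the copy still covers the whole contiguous
# per-block region, padding included.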
@@ -208,8 +205,7 @@ __global__ void reshape_and_cache_flash_kernel(
const int64_t block_stride, const int64_t page_stride, const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride, const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size, const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale, const int block_size, const float* k_scale, const float* v_scale) {
const int kv_scale_stride) {
const int64_t token_idx = blockIdx.x; const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx]; const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded // NOTE: slot_idx can be -1 if the token is padded
@@ -233,23 +229,21 @@ __global__ void reshape_and_cache_flash_kernel(
// this is true for the NHD layout where `head_stride == head_size` // this is true for the NHD layout where `head_stride == head_size`
const bool is_contiguous_heads = (head_stride == head_size); const bool is_contiguous_heads = (head_stride == head_size);
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
if (is_contiguous_heads && kv_scale_stride == 0) { CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads) if (is_contiguous_heads) {
// NHD layout
// kv cache: [num_blocks, block_size, num_heads, head_size] // kv cache: [num_blocks, block_size, num_heads, head_size]
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x, vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
blockDim.x, k_op); blockDim.x, k_op);
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems, vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
threadIdx.x, blockDim.x, v_op); threadIdx.x, blockDim.x, v_op);
} else { } else {
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
// HND layout: heads are strided, but each head_size segment is contiguous // HND layout: heads are strided, but each head_size segment is contiguous
// kv cache: [num_blocks, num_heads, block_size, head_size] // kv cache: [num_blocks, num_heads, block_size, head_size]
const int lane = threadIdx.x & 31; // 0..31 within warp const int lane = threadIdx.x & 31; // 0..31 within warp
@@ -265,16 +259,6 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ v_dst_h = cache_t* __restrict__ v_dst_h =
value_dst + static_cast<int64_t>(head) * head_stride; value_dst + static_cast<int64_t>(head) * head_stride;
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: k_scale[head * kv_scale_stride];
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
? 0.f
: v_scale[head * kv_scale_stride];
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
// within each head, let the 32 threads of the warp perform the vector // within each head, let the 32 threads of the warp perform the vector
// copy // copy
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32, vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
@@ -407,7 +391,8 @@ __global__ void concat_and_cache_ds_mla_kernel(
} }
// Compute the scale for the tile // Compute the scale for the tile
float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN); float tile_scale = max_abs / 448.f;
tile_scale = fmaxf(tile_scale, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache // The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) { if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -476,8 +461,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
#endif #endif
} }
float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor; #if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
if (use_ue8m0) { if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale))); scale = exp2f(ceilf(log2f(scale)));
} }
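The use_ue8m0 branch above rounds the scale up to the next power of two so it can be stored as an exponent-only (UE8M0) value; exp2f(ceilf(log2f(scale))) in Python terms:

import math

def round_scale_ue8m0(scale: float) -> float:
    # Round up, never down, so quantized values cannot overflow relative to
    # the original amax-derived scale.
    return 2.0 ** math.ceil(math.log2(scale))

assert round_scale_ue8m0(0.013) == 2.0 ** -6   # 0.015625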
@@ -620,8 +608,7 @@ void reshape_and_cache(
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \ slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \ head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \ block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()), \ reinterpret_cast<const float*>(v_scale.data_ptr()));
kv_scale_stride);
void reshape_and_cache_flash( void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size] torch::Tensor& key, // [num_tokens, num_heads, head_size]
@@ -630,9 +617,8 @@ void reshape_and_cache_flash(
torch::Tensor& torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size] value_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& k_scale, // [1] or [num_heads] torch::Tensor& v_scale) {
torch::Tensor& v_scale) { // [1] or [num_heads]
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs. // slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
@@ -655,12 +641,6 @@ void reshape_and_cache_flash(
int64_t head_stride = key_cache.stride(2); int64_t head_stride = key_cache.stride(2);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
"k_scale and v_scale must have the same shape");
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
"k_scale and v_scale must be of shape [1] or [num_heads]");
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512)); dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
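The kv_scale_stride computed above lets one indexing expression serve both accepted scale shapes: with a [1] scale the stride is 0 and every head re-reads the same element, with a [num_heads] scale the stride is 1. Sketch of the same idea:

import torch

num_heads = 8
k_scale = torch.tensor([0.02])                 # per-tensor; or torch.rand(num_heads) for per-head
kv_scale_stride = 1 if k_scale.numel() > 1 else 0
per_head = [float(k_scale[h * kv_scale_stride]) for h in range(num_heads)]
# stride 0 broadcasts the single scale to every head; stride 1 picks one per head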

View File

@@ -80,10 +80,8 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8); reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
} }
// ASIMD does not support non-temporal loads
explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
explicit FP16Vec16(const FP32Vec16& vec); explicit FP16Vec16(const FP32Vec16& vec);
void save(void* ptr) const { void save(void* ptr) const {
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
@@ -192,9 +190,6 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr) explicit BF16Vec16(const void* ptr)
: reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {}; : reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
// ASIMD does not support non-temporal loads
explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
explicit BF16Vec16(const FP32Vec16&); explicit BF16Vec16(const FP32Vec16&);
@@ -479,9 +474,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
vld1q_f32(ptr + 12)}) {} vld1q_f32(ptr + 12)}) {}
// ASIMD does not support non-temporal loads
explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
explicit FP32Vec16(float32x4x4_t data) : reg(data) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec8& data) { explicit FP32Vec16(const FP32Vec8& data) {
@@ -764,96 +756,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
}; };
}; };
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
int8x16x4_t reg;
int8_t values[VEC_ELEM_NUM];
};
int8x16x4_t reg;
explicit INT8Vec64(const int8_t* ptr) { reg = vld1q_s8_x4(ptr); }
// ASIMD does not support non-temporal loads
explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
void save(int8_t* ptr) const { vst1q_s8_x4(ptr, reg); }
// masked store
void save(int8_t* p, int elem_num) const {
TORCH_CHECK(elem_num <= VEC_ELEM_NUM && elem_num > 0);
if (elem_num == VEC_ELEM_NUM) {
vst1q_s8_x4(p, reg);
return;
}
const int full_quadwords = elem_num / 16;
const int remaining_bytes = elem_num % 16;
for (int i = 0; i < full_quadwords; ++i) {
vst1q_s8(p + 16 * i, reg.val[i]);
}
if (remaining_bytes) {
const int8x16_t v = reg.val[full_quadwords];
int8_t* tail = p + 16 * full_quadwords;
switch (remaining_bytes) {
case 15:
tail[14] = vgetq_lane_s8(v, 14);
[[fallthrough]];
case 14:
tail[13] = vgetq_lane_s8(v, 13);
[[fallthrough]];
case 13:
tail[12] = vgetq_lane_s8(v, 12);
[[fallthrough]];
case 12:
tail[11] = vgetq_lane_s8(v, 11);
[[fallthrough]];
case 11:
tail[10] = vgetq_lane_s8(v, 10);
[[fallthrough]];
case 10:
tail[9] = vgetq_lane_s8(v, 9);
[[fallthrough]];
case 9:
tail[8] = vgetq_lane_s8(v, 8);
[[fallthrough]];
case 8:
tail[7] = vgetq_lane_s8(v, 7);
[[fallthrough]];
case 7:
tail[6] = vgetq_lane_s8(v, 6);
[[fallthrough]];
case 6:
tail[5] = vgetq_lane_s8(v, 5);
[[fallthrough]];
case 5:
tail[4] = vgetq_lane_s8(v, 4);
[[fallthrough]];
case 4:
tail[3] = vgetq_lane_s8(v, 3);
[[fallthrough]];
case 3:
tail[2] = vgetq_lane_s8(v, 2);
[[fallthrough]];
case 2:
tail[1] = vgetq_lane_s8(v, 1);
[[fallthrough]];
case 1:
tail[0] = vgetq_lane_s8(v, 0);
break;
default:
break;
}
}
}
// ASIMD does not support non-temporal stores
void nt_save(int8_t* ptr) const { save(ptr); }
}; // INT8Vec64
template <typename T> template <typename T>
struct VecType { struct VecType {
using vec_type = void; using vec_type = void;

View File

@@ -116,7 +116,7 @@ class Dequantizer4b {
scalar_vec_t output_vec_0(wb_0); scalar_vec_t output_vec_0(wb_0);
scalar_vec_t output_vec_1(wb_1); scalar_vec_t output_vec_1(wb_1);
// AMX needs to interleave K elements to pack as 32 bits // AMX needs to interlave K elements to pack as 32 bits
if constexpr (isa == ISA::AMX) { if constexpr (isa == ISA::AMX) {
vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
} else { } else {

View File

@@ -360,14 +360,13 @@ void onednn_scaled_mm(
const std::optional<torch::Tensor>& azp, // [M] or [1] const std::optional<torch::Tensor>& azp, // [M] or [1]
const std::optional<torch::Tensor>& azp_adj, // [M] or [1] const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
const std::optional<torch::Tensor>& bias, // [N] const std::optional<torch::Tensor>& bias, // [N]
const torch::Tensor& handler_tensor) { int64_t handler) {
CPU_KERNEL_GUARD_IN(onednn_scaled_mm) CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.is_contiguous()); TORCH_CHECK(a.is_contiguous());
TORCH_CHECK(c.is_contiguous()); TORCH_CHECK(c.is_contiguous());
W8A8MatMulPrimitiveHandler* ptr = W8A8MatMulPrimitiveHandler* ptr =
reinterpret_cast<W8A8MatMulPrimitiveHandler*>( reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
handler_tensor.item<int64_t>());
const int32_t* azp_ptr = nullptr; const int32_t* azp_ptr = nullptr;
if (azp.has_value()) { if (azp.has_value()) {
azp_ptr = azp->data_ptr<int32_t>(); azp_ptr = azp->data_ptr<int32_t>();
@@ -520,14 +519,13 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, // [M, OC], row-major void onednn_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major const torch::Tensor& a, // [M, IC], row-major
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias, int64_t handler) {
const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_mm) CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1); TORCH_CHECK(a.stride(-1) == 1);
TORCH_CHECK(c.stride(-1) == 1); TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr = MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>()); reinterpret_cast<MatMulPrimitiveHandler*>(handler);
// ACL matmuls expect contiguous source tensors // ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL #ifdef VLLM_USE_ACL

View File

@@ -5,10 +5,6 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#ifdef __aarch64__
#include <atomic>
#endif
namespace { namespace {
#define MAX_SHM_RANK_NUM 8 #define MAX_SHM_RANK_NUM 8
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) #define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
@@ -38,17 +34,8 @@ struct KernelVecType<c10::Half> {
}; };
struct ThreadSHMContext { struct ThreadSHMContext {
#ifdef __aarch64__
// memory model is weaker on AArch64, so we use atomic variables for
// consumer (load-acquire) and producer (store-release) to make sure
// that a stamp cannot be ready before the corresponding data is ready.
std::atomic<char> _curr_thread_stamp[2];
std::atomic<char> _ready_thread_stamp[2];
static_assert(std::atomic<char>::is_always_lock_free);
#else
volatile char _curr_thread_stamp[2]; volatile char _curr_thread_stamp[2];
volatile char _ready_thread_stamp[2]; volatile char _ready_thread_stamp[2];
#endif // __aarch64__
int local_stamp_buffer_idx; int local_stamp_buffer_idx;
int remote_stamp_buffer_idx; int remote_stamp_buffer_idx;
int thread_id; int thread_id;
@@ -75,17 +62,10 @@ struct ThreadSHMContext {
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0); TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
#ifdef __aarch64__
_curr_thread_stamp[0].store(1, std::memory_order_relaxed);
_curr_thread_stamp[1].store(1, std::memory_order_relaxed);
_ready_thread_stamp[0].store(0, std::memory_order_relaxed);
_ready_thread_stamp[1].store(0, std::memory_order_relaxed);
#else
_curr_thread_stamp[0] = 1; _curr_thread_stamp[0] = 1;
_curr_thread_stamp[1] = 1; _curr_thread_stamp[1] = 1;
_ready_thread_stamp[0] = 0; _ready_thread_stamp[0] = 0;
_ready_thread_stamp[1] = 0; _ready_thread_stamp[1] = 0;
#endif // __aarch64__
_thread_buffer_mask[0] = 0; _thread_buffer_mask[0] = 0;
_thread_buffer_mask[1] = 0; _thread_buffer_mask[1] = 0;
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
@@ -123,43 +103,19 @@ struct ThreadSHMContext {
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF; _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
} }
char get_curr_stamp(int idx) const { char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
#ifdef __aarch64__
return _curr_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _curr_thread_stamp[idx];
#endif // __aarch64__
}
char get_ready_stamp(int idx) const { char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
#ifdef __aarch64__
return _ready_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _ready_thread_stamp[idx];
#endif // __aarch64__
}
void next_stamp() { void next_stamp() {
#ifdef __aarch64__
_curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
1, std::memory_order_release);
#else
_mm_mfence(); _mm_mfence();
_curr_thread_stamp[local_stamp_buffer_idx] += 1; _curr_thread_stamp[local_stamp_buffer_idx] += 1;
#endif // __aarch64__
} }
void commit_ready_stamp() { void commit_ready_stamp() {
#ifdef __aarch64__
_ready_thread_stamp[local_stamp_buffer_idx].store(
_curr_thread_stamp[local_stamp_buffer_idx].load(
std::memory_order_relaxed),
std::memory_order_release);
#else
_mm_mfence(); _mm_mfence();
_ready_thread_stamp[local_stamp_buffer_idx] = _ready_thread_stamp[local_stamp_buffer_idx] =
_curr_thread_stamp[local_stamp_buffer_idx]; _curr_thread_stamp[local_stamp_buffer_idx];
#endif // __aarch64__
} }
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
@@ -186,11 +142,7 @@ struct ThreadSHMContext {
break; break;
} }
++_spinning_count; ++_spinning_count;
#ifdef __aarch64__
__asm__ __volatile__("yield");
#else
_mm_pause(); _mm_pause();
#endif // __aarch64__
} }
} }
@@ -237,10 +189,10 @@ struct ThreadSHMContext {
class SHMManager { class SHMManager {
public: public:
explicit SHMManager(const std::string& name, const int rank, explicit SHMManager(const std::string& name, const int rank,
const int group_size, const int thread_num) const int group_size)
: _rank(rank), : _rank(rank),
_group_size(group_size), _group_size(group_size),
_thread_num(thread_num), _thread_num(omp_get_max_threads()),
_shm_names({""}), _shm_names({""}),
_shared_mem_ptrs({nullptr}), _shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) { _shm_ctx(nullptr) {
@@ -282,11 +234,11 @@ class SHMManager {
} }
static int64_t create_singleton_instance(const std::string& name, static int64_t create_singleton_instance(const std::string& name,
const int group_size, const int rank, const int group_size,
const int thread_num) { const int rank) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock); std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back( SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size, thread_num)); std::make_unique<SHMManager>(name, rank, group_size));
return static_cast<int64_t>(SingletonInstances.size() - 1); return static_cast<int64_t>(SingletonInstances.size() - 1);
} }
@@ -854,9 +806,8 @@ std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
} }
int64_t init_shm_manager(const std::string& name, const int64_t group_size, int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank, const int64_t thread_num) { const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank, return SHMManager::create_singleton_instance(name, group_size, rank);
thread_num);
} }
std::string join_shm_manager(int64_t handle, const std::string& name) { std::string join_shm_manager(int64_t handle, const std::string& name) {

View File

@@ -19,14 +19,13 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp, const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& azp_adj, const std::optional<torch::Tensor>& azp_adj,
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor); int64_t handler);
int64_t create_onednn_mm_handler(const torch::Tensor& b, int64_t create_onednn_mm_handler(const torch::Tensor& b,
int64_t primitive_cache_size); int64_t primitive_cache_size);
void onednn_mm(torch::Tensor& c, const torch::Tensor& a, void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias, int64_t handler);
const torch::Tensor& handler_tensor);
bool is_onednn_acl_supported(); bool is_onednn_acl_supported();
@@ -35,7 +34,7 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& block_tables, torch::Tensor& seq_lens); torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size, int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank, const int64_t thread_num); const int64_t rank);
std::string join_shm_manager(int64_t handle, const std::string& name); std::string join_shm_manager(int64_t handle, const std::string& name);
@@ -197,7 +196,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN GEMM // oneDNN GEMM
ops.def( ops.def(
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, " "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
"Tensor handler_tensor) -> ()"); "int handler) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm); ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend // Check if oneDNN was built with ACL backend
@@ -213,7 +212,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
ops.def( ops.def(
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
"Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()"); "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
@@ -231,11 +230,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif #endif
// SHM CCL // SHM CCL
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) #ifdef __AVX512F__
ops.def( ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
"init_shm_manager(str name, int group_size, int rank, int thread_num) -> " &init_shm_manager);
"int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager); ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()"); ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce); ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
@@ -253,7 +250,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list); ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)", ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
&shm_recv_tensor_list); &shm_recv_tensor_list);
#endif // #if defined(__AVX512F__) || defined(__aarch64__) #endif
// sgl-kernels // sgl-kernels
#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__) #if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)
@@ -294,7 +291,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor " "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
"seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt " "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, " "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()", "float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache); &cpu_attention_with_kv_cache);
// placeholders // placeholders
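Per the updated schemas above, the oneDNN primitive handler is no longer passed as a raw int but boxed in a tensor and unboxed on the C++ side with handler_tensor.item<int64_t>(). A runnable sketch of just the boxing side (the handler value is a stand-in for what create_onednn_mm_handler returns, and the op call appears only as a comment with illustrative names):

import torch

handler = 0x7F3C2A400000                                   # stand-in for the real handler id
handler_tensor = torch.tensor(handler, dtype=torch.int64)  # box it for the Tensor-typed schema
assert handler_tensor.item() == handler                    # C++: handler_tensor.item<int64_t>()
# e.g. torch.ops._C.onednn_mm(c, a, bias, handler_tensor)  # namespace/arguments illustrative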

View File

@@ -31,6 +31,8 @@ namespace moe {
constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32; constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512;
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
namespace warp_topk { namespace warp_topk {
@@ -63,6 +65,14 @@ __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
return res; return res;
} }
template <typename T, typename idxT>
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
return max(cache_topk,
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
}
template <int size, bool ascending, bool reverse, typename T, typename idxT, template <int size, bool ascending, bool reverse, typename T, typename idxT,
bool is_stable> bool is_stable>
struct BitonicMerge { struct BitonicMerge {
@@ -257,15 +267,6 @@ class WarpSort {
} }
} }
// Accessors for per-lane selected value/index.
// NOTE: For the common case `capacity == WARP_SIZE`, `max_arr_len_ == 1`
// and callers should use `i == 0`.
__device__ __forceinline__ idxT get_idx(int i = 0) const {
return idx_arr_[i];
}
__device__ __forceinline__ T get_val(int i = 0) const { return val_arr_[i]; }
protected: protected:
static constexpr int max_arr_len_ = capacity / WARP_SIZE; static constexpr int max_arr_len_ = capacity / WARP_SIZE;
@@ -284,7 +285,6 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
__device__ WarpSelect(idxT k, T dummy) __device__ WarpSelect(idxT k, T dummy)
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy), : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
k_th_(dummy), k_th_(dummy),
k_th_idx_(0),
k_th_lane_((k - 1) % WARP_SIZE) { k_th_lane_((k - 1) % WARP_SIZE) {
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[]; extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
@@ -346,6 +346,9 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0; idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
merge_buf_(val, idx); merge_buf_(val, idx);
} }
// after done(), smem is used for merging results among warps
__syncthreads();
} }
private: private:
@@ -500,186 +503,255 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
} }
} }
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF> template <typename T, typename BiasT, ScoringFunc SF>
__global__ void grouped_topk_fused_kernel( __global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
T* scores, float* topk_values, IdxT* topk_indices, BiasT const* bias, int64_t const num_tokens,
int64_t const num_tokens, int64_t const num_experts, int64_t const n_group, int64_t const num_cases,
int64_t const topk_group, int64_t const topk, bool renormalize, int64_t const n_group,
int64_t const num_experts_per_group) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
if (case_id < num_cases) {
input += case_id * num_experts_per_group;
// bias is per expert group, offset to current group
int32_t group_id = case_id % n_group;
BiasT const* group_bias = bias + group_id * num_experts_per_group;
output += case_id;
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;");
#endif
topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
num_experts_per_group);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
int NGroup = -1>
__global__ void group_idx_and_topk_idx_kernel(
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool renormalize,
double routed_scaling_factor) { double routed_scaling_factor) {
int32_t const token_id = static_cast<int32_t>(blockIdx.x); int32_t warp_id = threadIdx.x / WARP_SIZE;
if (token_id >= num_tokens) { int32_t lane_id = threadIdx.x % WARP_SIZE;
return; int32_t case_id =
} blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token
scores += case_id * num_experts;
group_scores += case_id * n_group;
topk_values += case_id * topk;
topk_indices += case_id * topk;
int32_t const warp_id = threadIdx.x / WARP_SIZE; constexpr bool kUseStaticNGroup = (NGroup > 0);
int32_t const lane_id = threadIdx.x % WARP_SIZE; // use int32 to avoid implicit conversion
int32_t const n_group_i32 =
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
int32_t const n_group_i32 = static_cast<int32_t>(n_group); int32_t align_num_experts_per_group =
int32_t const topk_group_i32 = static_cast<int32_t>(topk_group); warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
int32_t const topk_i32 = static_cast<int32_t>(topk);
int32_t const num_experts_i32 = static_cast<int32_t>(num_experts);
int32_t const num_warps = blockDim.x / WARP_SIZE;
if (warp_id >= n_group_i32 || num_warps < n_group_i32) {
return;
}
int32_t const num_experts_per_group = num_experts_i32 / n_group_i32;
T* scores_token = scores + static_cast<int64_t>(token_id) * num_experts;
cg::thread_block block = cg::this_thread_block(); cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
extern __shared__ char smem_buf[]; extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
// warpSelect internal staging buffer layout // store the target topk idx
size_t const val_bytes = int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T); T* s_topk_value =
size_t const val_bytes_aligned = reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
warp_topk::round_up_to_multiple_of<256>(val_bytes); warp_id * topk;
size_t const idx_bytes = s_topk_idx += warp_id * topk;
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
// user-managed shared memory starts after warpSelect internal staging. T value = neg_inf<T>();
uintptr_t ptr_u = reinterpret_cast<uintptr_t>(smem_buf + internal_bytes); T topk_group_value = neg_inf<T>();
ptr_u = (ptr_u + 15) & ~static_cast<uintptr_t>(15); // align to 16B int32_t num_equalto_topkth_group;
T* s_group_scores = reinterpret_cast<T*>(ptr_u);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
// acqbulk because it's ptr arithmetic // acqbulk because it's ptr arithmetic
#endif #endif
// phase 1: per-group scan if (case_id < num_tokens) {
int32_t const group_offset = warp_id * num_experts_per_group; // calculate group_idx
topk_with_k2<T, BiasT, SF>(s_group_scores + warp_id, int32_t target_num_min =
scores_token + group_offset, bias + group_offset, WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
tile, lane_id, num_experts_per_group); // The check is necessary to avoid abnormal input
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
value = group_scores[lane_id];
}
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
int pre_count_equal_to_top_value = 0;
// Use loop to find the largset top_group
while (count_equal_to_top_value < target_num_min) {
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) {
value = neg_inf<T>();
}
pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value =
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
}
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
}
__syncthreads();
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
queue((int32_t)topk, neg_inf<T>());
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
if (case_id < num_tokens && if_proceed_next_topk) {
auto process_group = [&](int i_group) {
if ((group_scores[i_group] > topk_group_value) ||
((group_scores[i_group] == topk_group_value) &&
(count_equalto_topkth_group < num_equalto_topkth_group))) {
int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates = neg_inf<T>();
if (i < num_experts_per_group) {
// apply scoring function (if any) and add bias
T input = scores[offset + i];
if (is_finite(input)) {
T score = apply_scoring<SF>(input);
candidates = score + static_cast<T>(bias[offset + i]);
}
}
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {
count_equalto_topkth_group++;
}
}
};
if constexpr (kUseStaticNGroup) {
#pragma unroll
for (int i_group = 0; i_group < NGroup; ++i_group) {
process_group(i_group);
}
} else {
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
process_group(i_group);
}
}
queue.done();
// Get the topk_idx
queue.dumpIdx(s_topk_idx);
}
// Load the valid score value
// Calculate the summation
float topk_sum = 1e-20;
if (case_id < num_tokens && if_proceed_next_topk) {
for (int i = lane_id;
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
i += WARP_SIZE) {
T value = cuda_cast<T, float>(0.0f);
if (i < topk) {
// Load the score value (without bias) for normalization
T input = scores[s_topk_idx[i]];
value = apply_scoring<SF>(input);
s_topk_value[i] = value;
}
if (renormalize) {
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
}
}
}
__syncthreads(); __syncthreads();
// phase 2: warp0 selects groups + merges candidates to final topk if (case_id < num_tokens) {
if (warp_id != 0) { if (if_proceed_next_topk) {
return; float scale = routed_scaling_factor;
} if (renormalize) {
scale /= topk_sum;
topk_values += static_cast<int64_t>(token_id) * topk; }
topk_indices += static_cast<int64_t>(token_id) * topk; for (int i = lane_id; i < topk; i += WARP_SIZE) {
float base = cuda_cast<float, T>(s_topk_value[i]);
// select topk_group groups by group score float value = base * scale;
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t, topk_indices[i] = s_topk_idx[i];
/* is_stable */ true> topk_values[i] = value;
group_sel(static_cast<int32_t>(topk_group_i32), neg_inf<T>()); }
} else {
// all lanes must participate in WarpSelect::add(). for (int i = lane_id; i < topk; i += WARP_SIZE) {
T gscore = (lane_id < n_group_i32) ? s_group_scores[lane_id] : neg_inf<T>(); topk_indices[i] = i;
group_sel.add(gscore, lane_id); topk_values[i] = 1.0f / topk;
group_sel.done();
// proceed only if the k-th selected group score is not -inf
bool proceed = false;
if (topk_group_i32 > 0) {
int const kth_lane = topk_group_i32 - 1;
// broadcast the k-th selected group score to all lanes
T kth_val = __shfl_sync(FULL_WARP_MASK, group_sel.get_val(0), kth_lane);
proceed = (kth_val != neg_inf<T>());
}
if (!proceed) {
for (int i = lane_id; i < topk_i32; i += WARP_SIZE) {
topk_indices[i] = static_cast<IdxT>(i);
topk_values[i] = 1.0f / static_cast<float>(topk_i32);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
return;
}
// merge per-group topk candidates for selected groups, then select topk
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
expert_sel(static_cast<int32_t>(topk_i32), neg_inf<T>());
// selected group ids reside in lanes [0, topk_group)
int32_t sel_gid_lane = (lane_id < topk_group_i32) ? group_sel.get_idx(0) : 0;
// add candidates from selected groups to expert_sel
for (int32_t g = 0; g < topk_group_i32; ++g) {
int32_t gid = __shfl_sync(FULL_WARP_MASK, sel_gid_lane, g);
int32_t const offset = gid * num_experts_per_group;
int32_t const align_num_experts_per_group =
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) {
// all lanes must call `add()` the same number of times.
T cand = neg_inf<T>();
int32_t idx = 0;
if (i < num_experts_per_group) {
idx = offset + i;
T input = scores_token[idx];
if (is_finite(input)) {
T score = apply_scoring<SF>(input);
cand = score + static_cast<T>(bias[idx]);
}
} }
expert_sel.add(cand, idx);
} }
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
// default result.
} }
expert_sel.done();
// compute unbiased routing weights + optional renorm.
float lane_unbiased = 0.0f;
IdxT lane_idx = 0;
if (lane_id < topk_i32) {
lane_idx = static_cast<IdxT>(expert_sel.get_idx(0));
T in = scores_token[static_cast<int32_t>(lane_idx)];
lane_unbiased = cuda_cast<float, T>(apply_scoring<SF>(in));
}
float topk_sum = 1e-20f;
if (renormalize) {
topk_sum += cg::reduce(tile, lane_unbiased, cg::plus<float>());
}
float scale = static_cast<float>(routed_scaling_factor);
if (renormalize) {
scale /= topk_sum;
}
if (lane_id < topk_i32) {
topk_indices[lane_id] = lane_idx;
topk_values[lane_id] = lane_unbiased * scale;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
#endif
}
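For reference while reading the kernel above, the routing it implements can be sketched on the CPU. This is a simplified sketch, not the kernel's code: the function and struct names are hypothetical, sigmoid scoring is assumed, and the group score is assumed (as the topk_with_k2 naming suggests) to be the sum of a group's two largest biased expert scores. The weight computation mirrors the tail of the kernel: unbiased scores of the selected experts, scaled by routed_scaling_factor and, when renormalize is set, divided by their sum plus a small epsilon.

// Hypothetical CPU reference for the no-aux-loss grouped top-k routing above.
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <vector>

struct RoutingResult {
  std::vector<int> indices;
  std::vector<float> values;
};

RoutingResult grouped_topk_reference(const std::vector<float>& scores,  // [num_experts]
                                     const std::vector<float>& bias,    // [num_experts]
                                     int n_group, int topk_group, int topk,
                                     bool renormalize, float routed_scaling_factor) {
  const int num_experts = static_cast<int>(scores.size());
  const int experts_per_group = num_experts / n_group;

  // Sigmoid scoring; the biased copy is used only for selection.
  std::vector<float> s(num_experts), s_biased(num_experts);
  for (int e = 0; e < num_experts; ++e) {
    s[e] = 1.0f / (1.0f + std::exp(-scores[e]));
    s_biased[e] = s[e] + bias[e];
  }

  // Assumed group score: sum of the two largest biased scores in the group
  // (inefficient but clear; requires experts_per_group >= 2).
  auto group_score = [&](int g) {
    std::vector<float> v(s_biased.begin() + g * experts_per_group,
                         s_biased.begin() + (g + 1) * experts_per_group);
    std::partial_sort(v.begin(), v.begin() + 2, v.end(), std::greater<float>());
    return v[0] + v[1];
  };
  std::vector<int> groups(n_group);
  std::iota(groups.begin(), groups.end(), 0);
  std::partial_sort(groups.begin(), groups.begin() + topk_group, groups.end(),
                    [&](int a, int b) { return group_score(a) > group_score(b); });

  // Gather candidates from the selected groups, pick top-k by biased score.
  std::vector<int> cand;
  for (int i = 0; i < topk_group; ++i)
    for (int e = 0; e < experts_per_group; ++e)
      cand.push_back(groups[i] * experts_per_group + e);
  std::partial_sort(cand.begin(), cand.begin() + topk, cand.end(),
                    [&](int a, int b) { return s_biased[a] > s_biased[b]; });
  cand.resize(topk);

  // Output weights use the unbiased scores, scaled and optionally renormalized.
  float scale = routed_scaling_factor;
  if (renormalize) {
    float sum = 1e-20f;
    for (int e : cand) sum += s[e];
    scale /= sum;
  }
  RoutingResult out;
  out.indices = cand;
  for (int e : cand) out.values.push_back(s[e] * scale);
  return out;
}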
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
inline void launch_group_idx_and_topk_kernel(
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
float* topk_values, IdxT* topk_indices, BiasT const* bias,
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
int64_t const topk, int64_t const num_experts,
int64_t const num_experts_per_group, bool const renormalize,
double const routed_scaling_factor) {
auto launch = [&](auto* kernel_instance2) {
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
topk_values, topk_indices, bias, num_tokens, n_group,
topk_group, topk, num_experts, num_experts_per_group,
renormalize, routed_scaling_factor);
};
switch (n_group) {
case 4: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
break;
}
case 8: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
break;
}
case 16: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
break;
}
case 32: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
break;
}
default: {
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
break;
}
}
}
template <typename T, typename BiasT, typename IdxT> template <typename T, typename BiasT, typename IdxT>
-void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
-                   BiasT const* bias, int64_t const num_tokens,
-                   int64_t const num_experts, int64_t const n_group,
-                   int64_t const topk_group, int64_t const topk,
-                   bool const renormalize, double const routed_scaling_factor,
-                   int const scoring_func, bool enable_pdl = false,
-                   cudaStream_t const stream = 0) {
+void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
+                   IdxT* topk_indices, BiasT const* bias,
+                   int64_t const num_tokens, int64_t const num_experts,
+                   int64_t const n_group, int64_t const topk_group,
+                   int64_t const topk, bool const renormalize,
+                   double const routed_scaling_factor, int const scoring_func,
+                   bool enable_pdl = false, cudaStream_t const stream = 0) {
+  int64_t num_cases = num_tokens * n_group;
+  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
cudaLaunchConfig_t config; cudaLaunchConfig_t config;
// One block per token; one warp per group. config.gridDim = topk_with_k2_num_blocks;
config.gridDim = static_cast<uint32_t>(num_tokens); config.blockDim = BLOCK_SIZE;
config.blockDim = static_cast<uint32_t>(n_group) * WARP_SIZE; config.dynamicSmemBytes = 0;
// Dynamic shared memory: WarpSelect staging + per-group topk buffers.
int32_t const num_warps = static_cast<int32_t>(n_group);
size_t const val_bytes =
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
size_t const val_bytes_aligned =
warp_topk::round_up_to_multiple_of<256>(val_bytes);
size_t const idx_bytes =
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
size_t const extra_bytes = 16 + static_cast<size_t>(n_group) * sizeof(T);
config.dynamicSmemBytes = internal_bytes + extra_bytes;
config.stream = stream; config.stream = stream;
cudaLaunchAttribute attrs[1]; cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
@@ -687,35 +759,66 @@ void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
config.numAttrs = 1; config.numAttrs = 1;
config.attrs = attrs; config.attrs = attrs;
auto const sf = static_cast<ScoringFunc>(scoring_func); auto const sf = static_cast<ScoringFunc>(scoring_func);
int64_t const num_experts_per_group = num_experts / n_group;
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
num_tokens, num_cases, n_group, num_experts_per_group);
};
switch (sf) { switch (sf) {
case SCORING_NONE: { case SCORING_NONE: {
auto* kernel_instance = auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_NONE>; launch_topk_with_k2(kernel_instance1);
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values, break;
topk_indices, bias, num_tokens, num_experts, n_group,
topk_group, topk, renormalize, routed_scaling_factor);
return;
} }
case SCORING_SIGMOID: { case SCORING_SIGMOID: {
auto* kernel_instance = auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_SIGMOID>; launch_topk_with_k2(kernel_instance1);
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values, break;
topk_indices, bias, num_tokens, num_experts, n_group,
topk_group, topk, renormalize, routed_scaling_factor);
return;
} }
default: default:
// should be guarded by higher level checks. // should be guarded by higher level checks.
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
} }
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
size_t dynamic_smem_in_bytes =
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
topk);
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
config.stream = stream;
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
config.numAttrs = 1;
config.attrs = attrs;
switch (sf) {
case SCORING_NONE: {
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
case SCORING_SIGMOID: {
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
config, scores, group_scores, topk_values, topk_indices, bias,
num_tokens, n_group, topk_group, topk, num_experts,
num_experts_per_group, renormalize, routed_scaling_factor);
break;
}
default:
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
}
} }
 #define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT)                                  \
   template void invokeNoAuxTc<T, BiasT, IdxT>(                                \
-      T * scores, float* topk_values, IdxT* topk_indices, BiasT const* bias,  \
-      int64_t const num_tokens, int64_t const num_experts,                    \
+      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices,   \
+      BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
       int64_t const n_group, int64_t const topk_group, int64_t const topk,    \
       bool const renormalize, double const routed_scaling_factor,             \
       int const scoring_func, bool enable_pdl, cudaStream_t const stream);

 INSTANTIATE_NOAUX_TC(float, float, int32_t);
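Both launch paths above opt into programmatic dependent launch (PDL) via cudaLaunchAttributeProgrammaticStreamSerialization, paired with the griddepcontrol.launch_dependents instruction inside the kernels. A minimal stand-alone sketch of that pattern follows; the kernels and helper names are hypothetical, and it assumes compilation for sm_90 with a recent CUDA toolkit.

// Hedged sketch of a PDL producer/consumer pair, mirroring the attribute setup above.
#include <cuda_runtime.h>

__global__ void producer(float* buf) {
  buf[threadIdx.x] = static_cast<float>(threadIdx.x);
  // Device-side PDL intrinsic; corresponds to griddepcontrol.launch_dependents.
  cudaTriggerProgrammaticLaunchCompletion();
}

__global__ void consumer(const float* buf, float* out) {
  // Block until the producer grid has actually finished its writes.
  cudaGridDependencySynchronize();
  out[threadIdx.x] = buf[threadIdx.x] * 2.0f;
}

void launch_pair(float* buf, float* out, cudaStream_t stream, bool enable_pdl) {
  cudaLaunchConfig_t cfg{};
  cfg.gridDim = 1;
  cfg.blockDim = 32;
  cfg.stream = stream;
  cudaLaunchKernelEx(&cfg, producer, buf);

  // The dependent launch carries the PDL attribute, as in the code above.
  cudaLaunchAttribute attr{};
  attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attr.val.programmaticStreamSerializationAllowed = enable_pdl ? 1 : 0;
  cfg.attrs = &attr;
  cfg.numAttrs = 1;
  cudaLaunchKernelEx(&cfg, consumer, static_cast<const float*>(buf), out);
}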
@@ -740,21 +843,17 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
   int64_t num_tokens = input_size[0];
   int64_t num_experts = input_size[1];
   TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
-  TORCH_CHECK(n_group > 0, "n_group must be positive");
-  TORCH_CHECK(topk > 0, "topk must be positive");
-  TORCH_CHECK(topk_group > 0, "topk_group must be positive");
-  TORCH_CHECK(topk_group <= n_group, "topk_group must be <= n_group");
   TORCH_CHECK(num_experts % n_group == 0,
               "num_experts should be divisible by n_group");
   TORCH_CHECK(n_group <= 32,
               "n_group should be smaller than or equal to 32 for now");
   TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
-  TORCH_CHECK(topk <= topk_group * (num_experts / n_group),
-              "topk must be <= topk_group * (num_experts / n_group)");
   TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
                   scoring_func == vllm::moe::SCORING_SIGMOID,
               "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
+  torch::Tensor group_scores = torch::empty(
+      {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
   // Always output float32 for topk_values (eliminates Python-side conversion)
   torch::Tensor topk_values = torch::empty(
       {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
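To make the shape guards concrete, here is a worked check with illustrative (not prescriptive) settings: num_experts = 256 and n_group = 8 give 32 experts per group, topk_group = 4 exposes 4 * 32 = 128 candidate experts, and topk = 8 fits inside that budget, so all of the checks above pass.

// Worked example of the routing-shape invariants checked above (hypothetical numbers).
#include <cassert>

int main() {
  const long num_experts = 256, n_group = 8, topk_group = 4, topk = 8;
  assert(num_experts % n_group == 0);                    // 256 / 8 = 32 experts per group
  assert(topk_group <= n_group);                         // 4 <= 8
  assert(topk <= topk_group * (num_experts / n_group));  // 8 <= 4 * 32 = 128
  return 0;
}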
@@ -769,6 +868,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kFloat16: \ case torch::kFloat16: \
vllm::moe::invokeNoAuxTc<T, half, IdxT>( \ vllm::moe::invokeNoAuxTc<T, half, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \ reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \
@@ -779,6 +879,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kFloat32: \ case torch::kFloat32: \
vllm::moe::invokeNoAuxTc<T, float, IdxT>( \ vllm::moe::invokeNoAuxTc<T, float, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \ reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \
@@ -789,6 +890,7 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
case torch::kBFloat16: \ case torch::kBFloat16: \
vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \ vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \
reinterpret_cast<T*>(scores.mutable_data_ptr()), \ reinterpret_cast<T*>(scores.mutable_data_ptr()), \
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \ reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \ reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \ reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \

View File

@@ -58,7 +58,7 @@ TEMPLATE = (
"( MARLIN_KERNEL_PARAMS );" "( MARLIN_KERNEL_PARAMS );"
) )
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4] THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]

View File

@@ -3,8 +3,8 @@
 #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/marlin/marlin.cuh"
-#include "quantization/marlin/marlin_dtypes.cuh"
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

 #define MARLIN_KERNEL_PARAMS \

View File

@@ -23,10 +23,10 @@
 #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/marlin/marlin.cuh"
-#include "quantization/marlin/marlin_dtypes.cuh"
-#include "quantization/marlin/dequant.h"
-#include "quantization/marlin/marlin_mma.h"
+#include "quantization/gptq_marlin/marlin.cuh"
+#include "quantization/gptq_marlin/marlin_dtypes.cuh"
+#include "quantization/gptq_marlin/dequant.h"
+#include "quantization/gptq_marlin/marlin_mma.h"
 #include "core/scalar_type.hpp"

 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \

View File

@@ -126,16 +126,14 @@ thread_config_t small_batch_thread_configs[] = {
     // thread_k, thread_n, num_threads
     {128, 128, 256},
-    {64, 128, 128},
-    {128, 64, 128}};
+    {64, 128, 128}};

 thread_config_t large_batch_thread_configs[] = {
     // Ordered by priority
     // thread_k, thread_n, num_threads
     {64, 256, 256},
-    {64, 128, 128},
-    {128, 64, 128}};
+    {64, 128, 128}};

 typedef struct {
   int blocks_per_sm;

View File

@@ -4,13 +4,7 @@
 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output, bool renormalize,
-                  std::optional<torch::Tensor> bias);
-
-void topk_sigmoid(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
-                  torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output, bool renormalize,
-                  std::optional<torch::Tensor> bias);
+                  torch::Tensor& gating_output, bool renormalize);

 void moe_sum(torch::Tensor& input, torch::Tensor& output);

View File

@@ -42,7 +42,7 @@ void moe_permute(
auto sort_workspace = torch::empty( auto sort_workspace = torch::empty(
{sorter_size}, {sorter_size},
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
torch::Tensor topk_ids_for_sort = topk_ids; auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
auto permuted_experts_id = torch::empty_like(topk_ids); auto permuted_experts_id = torch::empty_like(topk_ids);
auto sorted_row_idx = torch::empty_like(inv_permuted_idx); auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
@@ -62,51 +62,35 @@ void moe_permute(
const int* expert_map_ptr = get_ptr<int>(expert_map.value()); const int* expert_map_ptr = get_ptr<int>(expert_map.value());
valid_num_ptr = valid_num_ptr =
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert; get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
topk_ids_for_sort = topk_ids.clone(); preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
preprocessTopkIdLauncher(get_ptr<int>(topk_ids_for_sort), n_token * topk,
expert_map_ptr, n_expert, stream); expert_map_ptr, n_expert, stream);
} }
// expert sort topk expert id and scan expert id get expert_first_token_offset // expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert( sortAndScanExpert(
get_ptr<const int>(topk_ids_for_sort), get_ptr<int>(token_expert_indices), get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert, get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream); n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
// DeepGEMM: use getMIndices kernel to compute
// 1) align_expert_first_token_offset (aligned prefix offsets)
// 2) m_indices (expert id for each aligned row)
// eg. expert0: 3, expert1: 5, expert2: 2 tokens respectively
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
// expert0: 3->4, expert1: 5->8, expert2: 2->4
// align_expert_first_token_offset = [0, 4, 12, 16]
// so m_indices = [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
torch::Tensor align_expert_first_token_offset;
const int64_t* aligned_expert_first_token_offset_ptr = nullptr;
if (align_block_size.has_value()) {
align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
aligned_expert_first_token_offset_ptr =
get_ptr<int64_t>(align_expert_first_token_offset);
}
// dispatch expandInputRowsKernelLauncher // dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] { MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>( expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input), get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx), get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx), get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset), get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
aligned_expert_first_token_offset_ptr, n_token, valid_num_ptr, n_hidden, n_hidden, topk, n_local_expert, align_block_size_value, stream);
topk, n_local_expert, align_block_size_value, stream);
}); });
// get m_indices and update expert_first_token_offset with align block
// this is only required for DeepGemm and not required for CUTLASS group gemm // this is only required for DeepGemm and not required for CUTLASS group gemm
if (align_block_size.has_value()) { if (align_block_size.has_value()) {
auto align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
expert_first_token_offset.copy_(align_expert_first_token_offset); expert_first_token_offset.copy_(align_expert_first_token_offset);
} }
} }
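A host-side sketch that reproduces the worked example given in the comments above, assuming getMIndices rounds each expert's token count up to align_block_size and prefix-sums the padded counts. The function name is hypothetical; this is a reading aid, not the CUDA implementation.

// CPU sketch of the DeepGEMM alignment bookkeeping described above:
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
//   -> aligned offsets        [0, 4, 12, 16]
//   -> m_indices              [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
#include <cstdint>
#include <cstdio>
#include <vector>

void m_indices_reference(const std::vector<int64_t>& offsets, int align,
                         std::vector<int64_t>& aligned, std::vector<int>& m_indices) {
  const int n_experts = static_cast<int>(offsets.size()) - 1;
  aligned.assign(n_experts + 1, 0);
  m_indices.clear();
  for (int e = 0; e < n_experts; ++e) {
    int64_t n_tokens = offsets[e + 1] - offsets[e];
    int64_t padded = (n_tokens + align - 1) / align * align;  // round up per expert
    aligned[e + 1] = aligned[e] + padded;
    for (int64_t i = 0; i < padded; ++i) m_indices.push_back(e);
  }
}

int main() {
  std::vector<int64_t> offsets{0, 3, 8, 10}, aligned;
  std::vector<int> m_indices;
  m_indices_reference(offsets, 4, aligned, m_indices);
  for (int64_t v : aligned) std::printf("%lld ", static_cast<long long>(v));  // 0 4 12 16
  std::printf("\n");
  return 0;
}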

View File

@@ -109,7 +109,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
       sorted_indices, total_indices, num_experts, expert_first_token_offset);
 }

-void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
                        int* permuted_experts, int* permuted_rows,
                        int64_t* expert_first_token_offset, int num_rows,
                        int num_experts, int num_experts_per_node, int k,

View File

@@ -48,7 +48,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
                                    int64_t* expert_first_token_offset,
                                    cudaStream_t stream);

-void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
                        int* permuted_experts, int* permuted_rows,
                        int64_t* expert_first_token_offset, int num_rows,
                        int num_experts, int num_experts_per_node, int k,
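For orientation, here is a CPU sketch of what the sort-and-scan step is expected to produce, based on the comment in moe_permute above ("expert sort topk expert id and scan expert id get expert_first_token_offset"). The real path runs on the GPU with a device sorter and a workspace buffer, so this reference, with its hypothetical names, is only an assumed reading aid.

// Hypothetical CPU reference for the "sort rows by expert, then scan counts" step.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct SortScanResult {
  std::vector<int> permuted_experts;               // expert id per sorted row
  std::vector<int> sorted_row_idx;                 // original (token * k) row per sorted slot
  std::vector<int64_t> expert_first_token_offset;  // exclusive prefix sum, size n_expert + 1
};

SortScanResult sort_and_scan_reference(const std::vector<int>& topk_ids, int n_expert) {
  const int n_rows = static_cast<int>(topk_ids.size());
  SortScanResult r;
  r.sorted_row_idx.resize(n_rows);
  std::iota(r.sorted_row_idx.begin(), r.sorted_row_idx.end(), 0);
  // Stable sort keeps rows of the same expert in their original order.
  std::stable_sort(r.sorted_row_idx.begin(), r.sorted_row_idx.end(),
                   [&](int a, int b) { return topk_ids[a] < topk_ids[b]; });
  for (int row : r.sorted_row_idx) r.permuted_experts.push_back(topk_ids[row]);
  // Scan: offset[e + 1] - offset[e] equals the number of rows routed to expert e.
  r.expert_first_token_offset.assign(n_expert + 1, 0);
  for (int id : topk_ids)
    if (id >= 0 && id < n_expert) ++r.expert_first_token_offset[id + 1];
  std::partial_sum(r.expert_first_token_offset.begin(),
                   r.expert_first_token_offset.end(),
                   r.expert_first_token_offset.begin());
  return r;
}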
@@ -60,8 +60,7 @@ void expandInputRowsKernelLauncher(
     T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream);

View File

@@ -5,8 +5,7 @@ __global__ void expandInputRowsKernel(
     T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
     int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_dest_rows, int64_t const cols, int64_t k,
     int num_local_experts, int align_block_size) {
   // Reverse permutation map.
@@ -19,22 +18,35 @@ __global__ void expandInputRowsKernel(
expanded_dest_row_to_expanded_source_row[expanded_dest_row]; expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row]; int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
if constexpr (ALIGN_BLOCK_SIZE) { if constexpr (ALIGN_BLOCK_SIZE) {
// convert (unaligned) expanded_dest_row -> aligned expanded_dest_row. // load g2s
// aligned_expert_first_token_offset[e] provides the aligned prefix start for (int idx = threadIdx.x; idx < num_local_experts + 1;
// for expert e. For non-local experts we map to the end (total aligned M). idx += blockDim.x) {
int64_t aligned_base = 0; smem_expert_first_token_offset[idx] =
int64_t token_offset_in_expert = 0; __ldg(expert_first_token_offset + idx);
if (expert_id >= num_local_experts) {
aligned_base =
__ldg(aligned_expert_first_token_offset + num_local_experts);
token_offset_in_expert = 0;
} else {
aligned_base = __ldg(aligned_expert_first_token_offset + expert_id);
token_offset_in_expert =
expanded_dest_row - __ldg(expert_first_token_offset + expert_id);
} }
expanded_dest_row = aligned_base + token_offset_in_expert; __syncthreads();
int lane_idx = threadIdx.x & 31;
if (lane_idx == 0) {
// set token_offset_in_expert = 0 if this expert is not local expert
int token_offset_in_expert =
expert_id >= num_local_experts
? 0
: expanded_dest_row - smem_expert_first_token_offset[expert_id];
int64_t accumulate_align_offset = 0;
#pragma unroll 1
for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) {
auto n_token_in_expert = smem_expert_first_token_offset[eidx] -
smem_expert_first_token_offset[eidx - 1];
accumulate_align_offset += (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
}
expanded_dest_row = accumulate_align_offset + token_offset_in_expert;
}
// lane0 shuffle broadcast align_expanded_dest_row
expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0);
} }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
@@ -76,8 +88,7 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts, T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row, int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset, int64_t* expert_first_token_offset, int64_t const num_rows,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) { int num_local_experts, const int& align_block_size, cudaStream_t stream) {
int64_t const blocks = num_rows * k; int64_t const blocks = num_rows * k;
@@ -93,12 +104,14 @@ void expandInputRowsKernelLauncher(
bool is_align_block_size = align_block_size != -1; bool is_align_block_size = align_block_size != -1;
auto func = func_map[is_check_skip][is_align_block_size]; auto func = func_map[is_check_skip][is_align_block_size];
func<<<blocks, threads, 0, stream>>>( int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);
func<<<blocks, threads, smem_size, stream>>>(
unpermuted_input, permuted_output, sorted_experts, unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row, expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx, expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, aligned_expert_first_token_offset, num_rows, expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_valid_tokens_ptr, cols, k, num_local_experts, align_block_size); num_local_experts, align_block_size);
} }
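The launcher above sizes dynamic shared memory as sizeof(int64_t) * (num_local_experts + 1) so the kernel can stage expert_first_token_offset in shared memory before computing aligned destination rows. A minimal sketch of that extern-shared staging pattern follows; the kernel name is hypothetical and it is not the kernel in this diff.

// Sketch of the dynamic shared memory pattern used above.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void stage_offsets(const int64_t* offsets, int n_local_experts) {
  extern __shared__ int64_t smem_offsets[];
  // Cooperative global-to-shared copy of the (n_local_experts + 1) offsets.
  for (int i = threadIdx.x; i < n_local_experts + 1; i += blockDim.x)
    smem_offsets[i] = offsets[i];
  __syncthreads();
  // ... subsequent per-row work reads smem_offsets instead of global memory ...
}

void launch(const int64_t* offsets, int n_local_experts, cudaStream_t stream) {
  size_t smem_size = sizeof(int64_t) * (n_local_experts + 1);
  stage_offsets<<<1, 128, smem_size, stream>>>(offsets, n_local_experts);
}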
template <class T, class U> template <class T, class U>

View File

@@ -62,12 +62,6 @@ __device__ __forceinline__ float toFloat(T value) {
} }
} }
// Scoring function enums
enum ScoringFunc {
SCORING_SOFTMAX = 0, // apply softmax
SCORING_SIGMOID = 1 // apply sigmoid
};
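One practical difference between the two scoring modes enumerated above: softmax scores for a token already sum to one, while per-expert sigmoid scores do not, which is one reason an explicit renormalization of the selected weights exists at all. A tiny stand-alone illustration (values chosen arbitrarily):

// Softmax scores sum to 1 by construction; sigmoid scores are independent per expert.
#include <cmath>
#include <cstdio>

int main() {
  const float logits[4] = {2.f, 1.f, 0.f, -1.f};
  float denom = 0.f;
  for (float x : logits) denom += std::exp(x);
  float softmax_sum = 0.f, sigmoid_sum = 0.f;
  for (float x : logits) {
    softmax_sum += std::exp(x) / denom;         // contributes to exactly 1.0
    sigmoid_sum += 1.f / (1.f + std::exp(-x));  // no shared normalizer
  }
  std::printf("softmax sum = %.3f, sigmoid sum = %.3f\n", softmax_sum, sigmoid_sum);
  return 0;
}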
// ====================== Softmax things =============================== // ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output // We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing. // in the softmax kernel when we extend this module to support expert-choice routing.
@@ -131,27 +125,6 @@ __launch_bounds__(TPB) __global__
} }
} }
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
void moeSigmoid(const InputType* input, const bool* finished, float* output, const int num_cols)
{
const int thread_row_offset = blockIdx.x * num_cols;
// Don't touch finished rows.
if ((finished != nullptr) && finished[blockIdx.x])
{
return;
}
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
const float val = toFloat(input[idx]);
const float sigmoid_val = 1.0f / (1.0f + __expf(-val));
output[idx] = sigmoid_val;
}
}
template <int TPB, typename IndType> template <int TPB, typename IndType>
__launch_bounds__(TPB) __global__ void moeTopK( __launch_bounds__(TPB) __global__ void moeTopK(
const float* inputs_after_softmax, const float* inputs_after_softmax,
@@ -163,8 +136,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int k, const int k,
const int start_expert, const int start_expert,
const int end_expert, const int end_expert,
const bool renormalize, const bool renormalize)
const float* bias)
{ {
using cub_kvp = cub::KeyValuePair<int, float>; using cub_kvp = cub::KeyValuePair<int, float>;
@@ -190,13 +162,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
{ {
const int idx = thread_read_offset + expert; const int idx = thread_read_offset + expert;
inp_kvp.key = expert; inp_kvp.key = expert;
inp_kvp.value = inputs_after_softmax[idx];
// Apply correction bias if provided
if (bias != nullptr) {
inp_kvp.value = inputs_after_softmax[idx] + bias[expert];
} else {
inp_kvp.value = inputs_after_softmax[idx];
}
for (int prior_k = 0; prior_k < k_idx; ++prior_k) for (int prior_k = 0; prior_k < k_idx; ++prior_k)
{ {
@@ -220,13 +186,12 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool should_process_row = row_is_active && node_uses_expert; const bool should_process_row = row_is_active && node_uses_expert;
const int idx = k * block_row + k_idx; const int idx = k * block_row + k_idx;
// Return the unbiased scores for output weights output[idx] = result_kvp.value;
output[idx] = inputs_after_softmax[thread_read_offset + expert];
indices[idx] = should_process_row ? (expert - start_expert) : num_experts; indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0); assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row; source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) { if (renormalize) {
selected_sum += inputs_after_softmax[thread_read_offset + expert]; selected_sum += result_kvp.value;
} }
} }
__syncthreads(); __syncthreads();
@@ -260,12 +225,10 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k. 2) This implementation assumes k is small, but will work for any k.
*/ */
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
typename InputType = float, ScoringFunc SF>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGating(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices, void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
const float* bias)
{ {
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> || static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>, std::is_same_v<InputType, __half>,
@@ -390,89 +353,61 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
} }
} }
if constexpr (SF == SCORING_SOFTMAX) { // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
// First, we perform a max reduce within the thread. // convert to float afterwards for the exp + sum reduction.
float thread_max = row_chunk[0]; float thread_max = row_chunk[0];
#pragma unroll #pragma unroll
for (int ii = 1; ii < VPT; ++ii) { for (int ii = 1; ii < VPT; ++ii)
{
thread_max = max(thread_max, row_chunk[ii]); thread_max = max(thread_max, row_chunk[ii]);
} }
// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce. // Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW)); thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
} }
// From this point, thread max in all the threads have the max within the row. // From this point, thread max in all the threads have the max within the row.
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum. // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
float row_sum = 0; float row_sum = 0;
#pragma unroll #pragma unroll
for (int ii = 0; ii < VPT; ++ii) for (int ii = 0; ii < VPT; ++ii)
{ {
row_chunk[ii] = expf(row_chunk[ii] - thread_max); row_chunk[ii] = expf(row_chunk[ii] - thread_max);
row_sum += row_chunk[ii]; row_sum += row_chunk[ii];
} }
// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern. // Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern.
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW); row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
} }
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row. // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
// However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
// argmax after computing the softmax. // argmax after computing the softmax.
const float reciprocal_row_sum = 1.f / row_sum; const float reciprocal_row_sum = 1.f / row_sum;
#pragma unroll #pragma unroll
for (int ii = 0; ii < VPT; ++ii) for (int ii = 0; ii < VPT; ++ii)
{ {
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum; row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
}
} else if constexpr (SF == SCORING_SIGMOID) {
#pragma unroll
for (int ii = 0; ii < VPT; ++ii)
{
row_chunk[ii] = 1.0f / (1.0f + __expf(-row_chunk[ii]));
}
} }
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
// If bias is not null, use biased value for selection
float row_chunk_for_choice[VPT];
// Apply correction bias
if (bias != nullptr) {
#pragma unroll
for (int ldg = 0; ldg < LDG_PER_THREAD; ++ldg) {
#pragma unroll
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
const int expert = first_elt_read_by_thread + ldg * COLS_PER_GROUP_LDG + ii;
float bias_val = expert < NUM_EXPERTS ? bias[expert] : 0.0f;
row_chunk_for_choice[ldg * ELTS_PER_LDG + ii] = row_chunk[ldg * ELTS_PER_LDG + ii] + bias_val;
}
}
} else {
#pragma unroll
for (int ii = 0; ii < VPT; ++ii) {
row_chunk_for_choice[ii] = row_chunk[ii];
}
}
// Now, row_chunk contains the softmax / sigmoid of the row chunk. Now, I want to find the topk elements in each row, along
// with the max index. // with the max index.
int start_col = first_elt_read_by_thread; int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f; float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) for (int k_idx = 0; k_idx < k; ++k_idx)
{ {
// First, each thread does the local argmax // First, each thread does the local argmax
float max_val_for_choice = row_chunk_for_choice[0];
float max_val = row_chunk[0]; float max_val = row_chunk[0];
int expert = start_col; int expert = start_col;
#pragma unroll #pragma unroll
@@ -481,14 +416,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
#pragma unroll #pragma unroll
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
{ {
float val_for_choice = row_chunk_for_choice[ldg * ELTS_PER_LDG + ii];
float val = row_chunk[ldg * ELTS_PER_LDG + ii]; float val = row_chunk[ldg * ELTS_PER_LDG + ii];
// No check on the experts here since columns with the smallest index are processed first and only // No check on the experts here since columns with the smallest index are processed first and only
// updated if > (not >=) // updated if > (not >=)
if (val_for_choice > max_val_for_choice) if (val > max_val)
{ {
max_val_for_choice = val_for_choice;
max_val = val; max_val = val;
expert = col + ii; expert = col + ii;
} }
@@ -501,14 +434,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
#pragma unroll #pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
{ {
float other_max_for_choice = VLLM_SHFL_XOR_SYNC_WIDTH(max_val_for_choice, mask, THREADS_PER_ROW);
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW); float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW); int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
// We want lower indices to "win" in every thread so we break ties this way // We want lower indices to "win" in every thread so we break ties this way
if (other_max_for_choice > max_val_for_choice || (other_max_for_choice == max_val_for_choice && other_expert < expert)) if (other_max > max_val || (other_max == max_val && other_expert < expert))
{ {
max_val_for_choice = other_max_for_choice;
max_val = other_max; max_val = other_max;
expert = other_expert; expert = other_expert;
} }
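The shuffle exchange above is a butterfly (XOR-pattern) reduction across the lanes of a row group, with the documented "lower index wins ties" rule. A stand-alone sketch of the same idea, simplified to a full 32-lane warp with a hypothetical helper name:

// Warp-wide arg-max via butterfly (XOR) shuffles; the lower index wins ties.
__device__ inline void warp_argmax(float& val, int& idx, int width = 32) {
  for (int mask = width / 2; mask > 0; mask /= 2) {
    float other_val = __shfl_xor_sync(0xffffffff, val, mask, width);
    int other_idx = __shfl_xor_sync(0xffffffff, idx, mask, width);
    if (other_val > val || (other_val == val && other_idx < idx)) {
      val = other_val;
      idx = other_idx;
    }
  }
  // After log2(width) rounds every lane holds the same (val, idx) pair.
}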
@@ -543,7 +474,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
{ {
const int offset_for_expert = expert % ELTS_PER_LDG; const int offset_for_expert = expert % ELTS_PER_LDG;
// Safe to set to any negative value since row_chunk values must be between 0 and 1. // Safe to set to any negative value since row_chunk values must be between 0 and 1.
row_chunk_for_choice[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f; row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
} }
} }
} }
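Both versions of the gating kernel realize top-k as k repeated arg-max passes, knocking each winner out of later passes by overwriting its slot with a large negative number (safe because the post-scoring values lie in [0, 1]). The same scheme on the CPU, as a reading aid rather than the kernel's logic, with a hypothetical function name:

// k successive arg-max passes with masking; scores are assumed to lie in [0, 1].
#include <utility>
#include <vector>

std::vector<std::pair<int, float>> topk_by_masking(std::vector<float> scores, int k) {
  std::vector<std::pair<int, float>> out;
  for (int pass = 0; pass < k; ++pass) {
    int best = 0;
    for (int i = 1; i < static_cast<int>(scores.size()); ++i)
      if (scores[i] > scores[best]) best = i;  // strict '>' keeps the lowest index on ties
    out.emplace_back(best, scores[best]);
    scores[best] = -10000.f;  // mask the winner out of later passes
  }
  return out;
}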
@@ -577,10 +508,10 @@ struct TopkConstants
}; };
} // namespace detail } // namespace detail
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType, ScoringFunc SF> template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices, void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
const float* bias, cudaStream_t stream) cudaStream_t stream)
{ {
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS); static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
@@ -590,51 +521,43 @@ void topkGatingLauncherHelper(const InputType* input, const bool* finished, floa
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGating<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType, SF><<<num_blocks, block_dim, 0, stream>>>( topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize, bias); input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
} }
#ifndef USE_ROCM #ifndef USE_ROCM
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
static_assert(WARP_SIZE == 32, \ static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \ "Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES, \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
IndType, InputType, SF>( \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
gating_output, nullptr, topk_weights, topk_indices, \ num_tokens, topk, 0, num_experts, renormalize, stream);
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
bias, stream);
#else #else
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \ if (WARP_SIZE == 64) { \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES, \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
IndType, InputType, SF>( \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
gating_output, nullptr, topk_weights, topk_indices, \ num_tokens, topk, 0, num_experts, renormalize, stream); \
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \ } else if (WARP_SIZE == 32) { \
bias, stream); \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
} else if (WARP_SIZE == 32) { \ gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES, \ num_tokens, topk, 0, num_experts, renormalize, stream); \
IndType, InputType, SF>( \ } else { \
gating_output, nullptr, topk_weights, topk_indices, \ assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
bias, stream); \
} else { \
assert(false && \
"Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
} }
#endif #endif
template <typename IndType, typename InputType, ScoringFunc SF> template <typename IndType, typename InputType>
void topkGatingKernelLauncher( void topkGatingSoftmaxKernelLauncher(
const InputType* gating_output, const InputType* gating_output,
float* topk_weights, float* topk_weights,
IndType* topk_indices, IndType* topk_indices,
int* token_expert_indices, int* token_expert_indices,
float* workspace, float* softmax_workspace,
const int num_tokens, const int num_tokens,
const int num_experts, const int num_experts,
const int topk, const int topk,
const bool renormalize, const bool renormalize,
const float* bias,
cudaStream_t stream) { cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4; static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
@@ -646,71 +569,64 @@ void topkGatingKernelLauncher(
#endif #endif
switch (num_experts) { switch (num_experts) {
case 1: case 1:
LAUNCH_TOPK(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 2: case 2:
LAUNCH_TOPK(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 4: case 4:
LAUNCH_TOPK(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 8: case 8:
LAUNCH_TOPK(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 16: case 16:
LAUNCH_TOPK(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 32: case 32:
LAUNCH_TOPK(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 64: case 64:
LAUNCH_TOPK(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 128: case 128:
LAUNCH_TOPK(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 256: case 256:
LAUNCH_TOPK(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
case 512: case 512:
LAUNCH_TOPK(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break; break;
// (CUDA only) support multiples of 64 when num_experts is not power of 2. // (CUDA only) support multiples of 64 when num_experts is not power of 2.
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
// alternatively we can test 4 bytes loading and enable it in future. // alternatively we can test 4 bytes loading and enable it in future.
#ifndef USE_ROCM #ifndef USE_ROCM
case 192: case 192:
LAUNCH_TOPK(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 320: case 320:
LAUNCH_TOPK(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 384: case 384:
LAUNCH_TOPK(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 448: case 448:
LAUNCH_TOPK(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
case 576: case 576:
LAUNCH_TOPK(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break; break;
#endif #endif
default: { default: {
TORCH_CHECK(workspace != nullptr, TORCH_CHECK(softmax_workspace != nullptr,
"workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256; static constexpr int TPB = 256;
if constexpr (SF == SCORING_SOFTMAX) { moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>( gating_output, nullptr, softmax_workspace, num_experts);
gating_output, nullptr, workspace, num_experts);
} else if constexpr (SF == SCORING_SIGMOID) {
moeSigmoid<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, workspace, num_experts);
} else {
TORCH_CHECK(false, "Unsupported scoring func");
}
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>( moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
workspace, nullptr, topk_weights, topk_indices, token_expert_indices, softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts, renormalize, bias); num_experts, topk, 0, num_experts, renormalize);
} }
} }
} }
@@ -719,55 +635,40 @@ void topkGatingKernelLauncher(
} // namespace vllm } // namespace vllm
template<typename ComputeType, vllm::moe::ScoringFunc SF> template<typename ComputeType>
void dispatch_topk_launch( void dispatch_topk_softmax_launch(
torch::Tensor& gating_output, torch::Tensor& gating_output,
torch::Tensor& topk_weights, torch::Tensor& topk_weights,
torch::Tensor& topk_indices, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices, torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace, torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
std::optional<torch::Tensor> bias, {
cudaStream_t stream)
{
const float* bias_ptr = nullptr;
if (bias.has_value()) {
const torch::Tensor& bias_tensor = bias.value();
TORCH_CHECK(bias_tensor.scalar_type() == at::ScalarType::Float, "bias tensor must be float32");
TORCH_CHECK(bias_tensor.dim() == 1, "bias tensor must be 1D");
TORCH_CHECK(bias_tensor.size(0) == num_experts, "bias size mismatch, expected: ", num_experts);
TORCH_CHECK(bias_tensor.is_contiguous(), "bias tensor must be contiguous");
bias_ptr = bias_tensor.data_ptr<float>();
}
if (topk_indices.scalar_type() == at::ScalarType::Int) { if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingKernelLauncher<int, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(), topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingKernelLauncher<uint32_t, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(), topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} else { } else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingKernelLauncher<int64_t, ComputeType, SF>( vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()), reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(), topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(), topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(), token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(), softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, num_tokens, num_experts, topk, renormalize, stream);
bias_ptr, stream);
} }
} }
@@ -776,8 +677,7 @@ void topk_softmax(
torch::Tensor& topk_indices, // [num_tokens, topk] torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk] torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output, // [num_tokens, num_experts] torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize, bool renormalize)
std::optional<torch::Tensor> bias)
{ {
const int num_experts = gating_output.size(-1); const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts; const auto num_tokens = gating_output.numel() / num_experts;
@@ -793,55 +693,14 @@ void topk_softmax(
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options); torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if (gating_output.scalar_type() == at::ScalarType::Float) { if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_launch<float, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) { } else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_launch<__half, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) { } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices, dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
bias, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}
void topk_sigmoid(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize,
std::optional<torch::Tensor> bias)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
const int topk = topk_weights.size(-1);
const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
const bool needs_workspace = !is_pow_2 || num_experts > 256;
const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor workspace = torch::empty({workspace_size}, workspace_options);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_launch<float, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_launch<__half, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
bias, stream);
} else { } else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type()); TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
} }

View File

@@ -5,17 +5,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
   // Apply topk softmax to the gating outputs.
   m.def(
       "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
-      "bias) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
   m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

-  // Apply topk sigmoid to the gating outputs.
-  m.def(
-      "topk_sigmoid(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
-      "bias) -> ()");
-  m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);
-
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
   m.def("moe_sum(Tensor input, Tensor! output) -> ()");

View File

@@ -260,6 +260,12 @@ void get_cutlass_moe_mm_data(
     const int64_t num_experts, const int64_t n, const int64_t k,
     const std::optional<torch::Tensor>& blockscale_offsets);

+void get_cutlass_moe_mm_problem_sizes(
+    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
+    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
+    std::optional<bool> force_swap_ab = std::nullopt);
+
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
     const torch::Tensor& expert_first_token_offset,
     torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -293,8 +299,7 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
torch::Tensor& output_scale, torch::Tensor& output_scale,
torch::Tensor const& input_scale, torch::Tensor const& input_scale);
bool is_sf_swizzled_layout);
void scaled_fp4_experts_quant( void scaled_fp4_experts_quant(
torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor& output, torch::Tensor& output_scale,

View File

@@ -27,24 +27,17 @@
#include "cuda_utils.h" #include "cuda_utils.h"
#include "launch_bounds_utils.h" #include "launch_bounds_utils.h"
// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
#define NVFP4_ENABLE_ELTS16 1
#include "nvfp4_utils.cuh" #include "nvfp4_utils.cuh"
namespace vllm { namespace vllm {
// Use UE4M3 by default. // Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false> template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
int32_t num_padded_cols, float const* SFScale, uint32_t* out,
Type const* __restrict__ in, uint32_t* SFout) {
float const* __restrict__ SFScale, using PackedVec = PackedVec<Type>;
uint32_t* __restrict__ out,
uint32_t* __restrict__ SFout) {
using PackedVec = vllm::PackedVec<Type>;
static constexpr int CVT_FP4_NUM_THREADS_PER_SF = static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -56,60 +49,34 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = (SFScale == nullptr) ? 1.0f : SFScale[0];
-
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < num_padded_cols) {
-      PackedVec in_vec;
-      PackedVec in_vec2;
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
       int64_t inOffset =
           rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
       int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
                           numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
 
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-        ld256_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-        ld128_or_zero_cg_u32<Type>(
-            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
-            valid);
-      }
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      auto& out_pos = out[outOffset];
 
       // Compute silu and mul
-      PackedVec out_silu_mul = compute_silu_mul<Type>(in_vec, in_vec2);
+      PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
 
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx, colIdx, numKTiles, SFout);
-      auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              out_silu_mul, SFScaleVal, sf_out);
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
-      }
+      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
+                                                     sf_out);
     }
   }
 }
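For reference, a tiny host-side sketch of the packing arithmetic behind the outOffset bookkeeping in this hunk: each FP4 value occupies 4 bits, so 8 values fill one uint32_t, and the PACK16 path combines two such words (hi, lo) into a single 64-bit store to halve the number of output transactions. The nibble patterns below are made up for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  // 8 FP4 values per 32-bit word: this is why the unpadded output offset
  // matches the input offset when each thread handles 8 elements.
  const int bits_per_fp4 = 4;
  const int elts_per_u32 = 32 / bits_per_fp4;  // 8
  printf("fp4 values per uint32_t: %d\n", elts_per_u32);

  // PACK16 path: two 32-bit halves are fused into one 64-bit store.
  const uint32_t lo = 0x76543210u, hi = 0xFEDCBA98u;  // example packed nibbles
  const uint64_t packed64 = (uint64_t(hi) << 32) | uint64_t(lo);
  printf("packed64 = 0x%016llx\n", (unsigned long long)packed64);
  return 0;
}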
@@ -136,23 +103,17 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
   auto output_ptr = static_cast<int64_t*>(output.data_ptr());
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
-  int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
-  int grid_x = std::min(
-      int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-  dim3 grid(grid_x, grid_y);
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
 
   VLLM_DISPATCH_HALF_TYPES(
       input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
         using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
         auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
         vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
-            m, n, sf_n_unpadded, input_ptr, input_sf_ptr,
+            m, n, input_ptr, input_sf_ptr,
             reinterpret_cast<uint32_t*>(output_ptr),
             reinterpret_cast<uint32_t*>(sf_out));
       });
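A host-side sketch of how the two-dimensional launch configuration on the removed side of this hunk comes together: grid.y covers the per-row scale-factor columns in block-sized chunks, and grid.x is capped by how many blocks the device can keep resident. The tensor shape, SM count, and blocks-per-SM figures are assumptions for illustration only.

#include <algorithm>
#include <cstdio>

// Ceil-division helper mirroring vllm::div_round_up.
int div_round_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Illustrative numbers: 2048 output columns, 8 elements per thread,
  // a GPU with 132 SMs that can keep 2 such blocks resident per SM.
  const int m = 8192, n = 2048, elts_per_thread = 8;
  const int block_x = std::min(n / elts_per_thread, 512);
  const int multiProcessorCount = 132, numBlocksPerSM = 2;

  const int sf_vec_size = 16;                 // assumed CVT_FP4_SF_VEC_SIZE
  const int sf_n_unpadded = n / sf_vec_size;  // scale factors per row
  const int grid_y = div_round_up(sf_n_unpadded, block_x);
  const int grid_x = std::min(
      m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
  printf("block=(%d) grid=(%d, %d)\n", block_x, grid_x, grid_y);
  return 0;
}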

View File

@@ -140,8 +140,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
-      out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-          quant_input, SFScaleVal, sf_out);
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
     }
   }

@@ -246,8 +246,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
                                              CVT_FP4_NUM_THREADS_PER_SF>(
               rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
-      out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-          quant_input, SFScaleVal, sf_out);
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
     }
   }
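The extra template argument on the removed side is the threads-per-scale-factor count. Assuming the usual NVFP4 constants (a 16-element scale-factor block and 8 elements per thread), it works out as in this trivial sketch; the constant values are assumptions, not taken from the headers.

#include <cstdio>

int main() {
  // Assumed: CVT_FP4_SF_VEC_SIZE == 16 and CVT_FP4_ELTS_PER_THREAD == 8,
  // so two neighbouring threads cooperate on each scale factor.
  const int sf_vec_size = 16, elts_per_thread = 8;
  printf("threads per scale factor: %d\n", sf_vec_size / elts_per_thread);
  return 0;
}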

View File

@@ -21,8 +21,7 @@
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf,
-                             bool is_sf_swizzled_layout);
+                             torch::Tensor const& input_sf);
 #endif
 
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \

@@ -52,12 +51,10 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
 #endif
 
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
-                      torch::Tensor& output_sf, torch::Tensor const& input_sf,
-                      bool is_sf_swizzled_layout) {
+                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
-  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
-                                 is_sf_swizzled_layout);
+  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
 }

View File

@@ -27,23 +27,29 @@
#include "cuda_utils.h" #include "cuda_utils.h"
#include "launch_bounds_utils.h" #include "launch_bounds_utils.h"
// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
#define NVFP4_ENABLE_ELTS16 1
#include "nvfp4_utils.cuh" #include "nvfp4_utils.cuh"
namespace vllm { namespace vllm {
template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type");
return ((x + y - 1) / y) * y;
}
// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
constexpr int ROW_TILE = 128;
return round_up(m, ROW_TILE);
}
// Use UE4M3 by default. // Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false> template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, int32_t num_padded_cols, cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
Type const* __restrict__ in, float const* SFScale, uint32_t* out, uint32_t* SFout) {
float const* __restrict__ SFScale, using PackedVec = PackedVec<Type>;
uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
using PackedVec = vllm::PackedVec<Type>;
static constexpr int CVT_FP4_NUM_THREADS_PER_SF = static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
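A host-side sketch of the same padding arithmetic as the round_up and computeEffectiveRows helpers above: row counts are padded to the 128-row tile used by the swizzled scale-factor layout, so the grid is sized from the padded count rather than the raw one.

#include <cstdio>
#include <type_traits>

// Same shape as the helper above: round an integer up to a multiple of y.
template <typename Int>
constexpr Int round_up(Int x, Int y) {
  static_assert(std::is_integral_v<Int>,
                "round_up argument must be integral type");
  return ((x + y - 1) / y) * y;
}

int main() {
  constexpr int ROW_TILE = 128;  // row tile of the swizzled SF layout
  for (int m : {1, 127, 128, 129, 1000}) {
    printf("m=%4d -> effective rows=%d\n", m, round_up(m, ROW_TILE));
  }
  return 0;
}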
@@ -53,31 +59,33 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   int32_t const numKTiles = (numCols + 63) / 64;
 
   int sf_m = round_up<int>(numRows, 128);
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
 
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
 
   // Iterate over all rows and cols including padded ones -
   // ensures we visit every single scale factor address to initialize it.
   for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
-    if (colIdx < num_padded_cols) {
+    for (int colIdx = threadIdx.x;
+         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
+      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
       PackedVec in_vec;
       int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
       // If we are outside valid rows OR outside valid columns -> Use Zeros
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-      }
+      if (rowIdx >= numRows || elem_idx >= numCols) {
+        memset(&in_vec, 0, sizeof(PackedVec));
+      } else {
+        // Valid Region: Load actual data
+        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      }
 
       auto sf_out =
@@ -86,85 +94,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
               rowIdx, colIdx, numKTiles, SFout);
       auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              in_vec, global_scale, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
       // We do NOT write output for padding because the 'out' tensor is not
       // padded.
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
-      }
-    }
-  }
-}
-
-// Use UE4M3 by default.
-template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
-    cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
-                             int32_t sf_n_unpadded, Type const* __restrict__ in,
-                             float const* __restrict__ SFScale,
-                             uint32_t* __restrict__ out,
-                             uint32_t* __restrict__ SFout) {
-  using PackedVec = PackedVec<Type>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
-      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
-  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
-  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is
-  // (448.f / (Alpha_A / 6.f)).
-  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
-
-  // Iterate over all rows and cols including padded ones -
-  // ensures we visit every single scale factor address to initialize it.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    if (colIdx < sf_n_unpadded) {
-      PackedVec in_vec;
-      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      // If we are outside valid rows OR outside valid columns -> Use Zeros
-      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
-      if constexpr (CVT_FP4_PACK16) {
-        ld256_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
-            valid);
-      } else {
-        ld128_or_zero_cg_u32<Type>(
-            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
-            valid);
-      }
-
-      auto sf_out =
-          sf_out_rowmajor_u8<uint32_t>(rowIdx, colIdx, sf_n_unpadded, SFout);
-      auto out_val =
-          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
-              in_vec, global_scale, sf_out);
-      // We do NOT write output for padding because the 'out' tensor is not
-      // padded.
-      if (valid) {
-        if constexpr (CVT_FP4_PACK16) {
-          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
-          uint64_t packed64 =
-              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
-          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
-        } else {
-          out[inOffset] = out_val;
-        }
+      if (rowIdx < numRows && elem_idx < numCols) {
+        // Same as inOffset because 8 elements are packed into one uint32_t.
+        out[inOffset] = out_val;
       }
     }
   }
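A worked example of the column padding used by this kernel, assuming the usual CVT_FP4_SF_VEC_SIZE of 16. The "/ 4" and "* 4" steps reflect per-row scale-factor counts being rounded up to a multiple of four (consistent with scale factors being handled four bytes at a time); the column sizes below are illustrative.

#include <cstdio>

int round_up(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
  const int sf_vec_size = 16;  // assumed CVT_FP4_SF_VEC_SIZE
  for (int numCols : {1024, 1040, 2048, 7168}) {
    // One scale factor per 16 elements, padded to a multiple of 4 SFs.
    int sf_n_unpadded = numCols / sf_vec_size;
    int sf_n_int = round_up(sf_n_unpadded, 4) / 4;
    int num_padded_cols = sf_n_int * 4 * sf_vec_size;
    printf("numCols=%5d sf_n_unpadded=%4d padded cols=%5d\n",
           numCols, sf_n_unpadded, num_padded_cols);
  }
  return 0;
}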
@@ -175,8 +111,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf,
-                             bool is_sf_swizzled_layout) {
+                             torch::Tensor const& input_sf) {
   int32_t m = input.size(0);
   int32_t n = input.size(1);
@@ -194,48 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
   // Grid, Block size. Each thread converts 8 values.
   dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
   int const numBlocksPerSM =
       vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int effectiveRows = vllm::computeEffectiveRows(m);
+  dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
 
-  if (is_sf_swizzled_layout) {
-    int sf_n_int = int(vllm::round_up(sf_n_unpadded, 4) / 4);
-    int32_t num_padded_cols =
-        sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
-
-    int grid_y = vllm::div_round_up(num_padded_cols, static_cast<int>(block.x));
-    int grid_x =
-        std::min(vllm::computeEffectiveRows(m),
-                 std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-    dim3 grid(grid_x, grid_y);
-
-    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
-      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-      // NOTE: We don't support e8m0 scales at this moment.
-      vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
-          m, n, num_padded_cols, input_ptr, input_sf_ptr,
-          reinterpret_cast<uint32_t*>(output_ptr),
-          reinterpret_cast<uint32_t*>(sf_out));
-    });
-  } else {
-    int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
-    int grid_x = std::min(
-        m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
-    dim3 grid(grid_x, grid_y);
-
-    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
-      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-      // NOTE: We don't support e8m0 scales at this moment.
-      vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
-          <<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
-                                       input_sf_ptr,
-                                       reinterpret_cast<uint32_t*>(output_ptr),
-                                       reinterpret_cast<uint32_t*>(sf_out));
-    });
-  }
-}
+  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+    // NOTE: We don't support e8m0 scales at this moment.
+    vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
+        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
+        reinterpret_cast<uint32_t*>(sf_out));
+  });
+}
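Rough buffer-size arithmetic for this entry point under common NVFP4 conventions (two FP4 values per byte, one scale factor per 16 elements, swizzled scale factors padded to 128-row by 4-column tiles). These conventions and sizes are assumptions for illustration, not a statement of the actual allocation logic in vLLM.

#include <cstdio>

int round_up(int x, int y) { return ((x + y - 1) / y) * y; }

int main() {
  const int m = 300, n = 4096;                       // illustrative shape
  const long out_bytes = (long)m * n / 2;            // packed FP4 data
  const int sf_rows = round_up(m, 128);              // padded rows (swizzled)
  const int sf_cols = round_up(n / 16, 4);           // padded SF columns
  const long sf_bytes_swizzled = (long)sf_rows * sf_cols;  // one byte per SF
  const long sf_bytes_rowmajor = (long)m * (n / 16);
  printf("output: %ld bytes, SF swizzled: %ld bytes, SF row-major: %ld bytes\n",
         out_bytes, sf_bytes_swizzled, sf_bytes_rowmajor);
  return 0;
}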

Some files were not shown because too many files have changed in this diff.