[AMD][Kernel][BugFix] Use correct scale in concat_and_cache_ds_mla_kernel when on gfx942 (#32976)

Signed-off-by: Randall Smith <ransmith@amd.com> Signed-off-by: Randall Smith <Randall.Smith@amd.com> Co-authored-by: Randall Smith <ransmith@amd.com>
[Models] Kimi-K2.5 (#33131)
2026-01-27 07:16:43 +00:00 · 2026-01-27 14:50:31 +08:00 · 2026-01-27 05:26:48 +00:00 · 2026-01-27 04:57:16 +00:00 · 2026-01-26 20:47:26 -08:00 · 2026-01-27 04:25:02 +00:00
1135 changed files with 58166 additions and 27145 deletions
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
@@ -1,7 +1,8 @@
 name: vllm_ci
 job_dirs:
-  - ".buildkite/test_areas"
  - ".buildkite/image_build"
+  - ".buildkite/test_areas"
+  - ".buildkite/hardware_tests"
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -0,0 +1,28 @@
+# Hardware CI: builds the "test" target of docker/Dockerfile.rocm and pushes
+# the result as rocm/vllm-ci:${BUILDKITE_COMMIT}.
+group: Hardware
+steps:
+  - label: "AMD: :docker: build image"
+    device: amd_cpu
+    no_plugin: true
+    commands:
+    # Folded scalar (`>`): the lines below are joined with spaces into a
+    # single `docker build` command line.
+    - >
+      docker build
+      --build-arg max_jobs=16
+      --build-arg REMOTE_VLLM=1
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
+      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      -f docker/Dockerfile.rocm
+      --target test
+      --no-cache
+      --progress plain .
+    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    # Each listed exit status is retried automatically at most once.
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
--- a/.buildkite/hardware_tests/arm.yaml
+++ b/.buildkite/hardware_tests/arm.yaml
@@ -0,0 +1,8 @@
+# Hardware CI: runs the Arm CPU test script on an arm_cpu device.
+group: Hardware
+steps:
+  - label: "Arm CPU Test"
+    soft_fail: true
+    device: arm_cpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
@@ -0,0 +1,10 @@
+# Hardware CI: runs the NPU test script on an ascend_npu device, with a
+# 20 minute step timeout.
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Ascend NPU Test"
+    soft_fail: true
+    timeout_in_minutes: 20
+    no_plugin: true
+    device: ascend_npu
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
@@ -0,0 +1,10 @@
+# Hardware CI: prints `nvidia-smi` output, then runs the GH200 test script
+# on a gh200 device.
+group: Hardware
+steps:
+  - label: "GH200 Test"
+    soft_fail: true
+    device: gh200
+    no_plugin: true
+    optional: true
+    commands:
+    - nvidia-smi
+    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -0,0 +1,23 @@
+# Hardware CI: one step each for Intel CPU, HPU and GPU devices; every step
+# is marked soft_fail and runs its matching script from
+# .buildkite/scripts/hardware_ci/.
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Intel CPU Test"
+    soft_fail: true
+    device: intel_cpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
+
+  - label: "Intel HPU Test"
+    soft_fail: true
+    device: intel_hpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
+
+  - label: "Intel GPU Test"
+    soft_fail: true
+    device: intel_gpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -1,56 +1,254 @@
 #!/bin/bash
-set -e
+set -euo pipefail

-if [[ $# -lt 8 ]]; then
-  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
-  exit 1
+# Sanitize a string for use as a Docker image tag.
+# Every character outside [a-zA-Z0-9._-] is replaced with '_' and each output
+# line is truncated to its first 128 characters.
+#   $1      - raw value (callers pass branch names)
+#   stdout  - sanitized value
+clean_docker_tag() {
+    local input="$1"
+    # printf rather than echo: echo would interpret values such as "-n" or
+    # "-e" as its own options and emit nothing.
+    printf '%s\n' "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
+}
+
+# Print the positional-argument synopsis to stderr and abort with status 1.
+# The synopsis mirrors the "input args" assignments in the main script:
+# seven mandatory arguments plus an optional eighth (image_tag_latest).
+print_usage_and_exit() {
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <image_tag> [<image_tag_latest>]" >&2
+    exit 1
+}
+
+# Print debug information about the build host: EC2 instance metadata
+# (queried through IMDSv2) and whether the warm-cache AMI marker file
+# /etc/vllm-ami-info is present. Metadata values that cannot be fetched are
+# reported as "unknown".
+print_instance_info() {
+    local imds="http://169.254.169.254/latest"
+    # Bound every IMDS request so hosts that are not on EC2 do not stall on
+    # the link-local address; -f makes HTTP errors count as failures so an
+    # error page is never printed as if it were a metadata value.
+    local -a curl_opts=(-sf --connect-timeout 2 --max-time 5)
+    # Keep the scratch variables out of the script's global scope.
+    local TOKEN AMI_ID INSTANCE_TYPE INSTANCE_ID AZ
+
+    echo ""
+    echo "=== Debug: Instance Information ==="
+    # Get IMDSv2 token
+    if TOKEN=$(curl "${curl_opts[@]}" -X PUT "${imds}/api/token" \
+            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
+        AMI_ID=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            "${imds}/meta-data/ami-id" 2>/dev/null || echo "unknown")
+        INSTANCE_TYPE=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            "${imds}/meta-data/instance-type" 2>/dev/null || echo "unknown")
+        INSTANCE_ID=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            "${imds}/meta-data/instance-id" 2>/dev/null || echo "unknown")
+        AZ=$(curl "${curl_opts[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            "${imds}/meta-data/placement/availability-zone" 2>/dev/null || echo "unknown")
+        echo "AMI ID:        ${AMI_ID}"
+        echo "Instance Type: ${INSTANCE_TYPE}"
+        echo "Instance ID:   ${INSTANCE_ID}"
+        echo "AZ:            ${AZ}"
+    else
+        echo "Not running on EC2 or IMDS not available"
+    fi
+    # Check for warm cache AMI (marker file baked into custom AMI)
+    if [[ -f /etc/vllm-ami-info ]]; then
+        echo "Cache:         warm (custom vLLM AMI)"
+        cat /etc/vllm-ami-info
+    else
+        echo "Cache:         cold (standard AMI)"
+    fi
+    echo "==================================="
+    echo ""
+}
+
+# Select the buildx builder for the bake build, creating one when needed.
+# Preference order:
+#   1. a standalone buildkitd reachable at ${BUILDKIT_SOCKET} (remote driver,
+#      builder name "baked-vllm-builder"),
+#   2. an already existing builder named ${BUILDER_NAME},
+#   3. a freshly created docker-container builder named ${BUILDER_NAME}.
+# The chosen builder is bootstrapped and the active builder is listed.
+setup_buildx_builder() {
+    echo "--- :buildkite: Setting up buildx builder"
+    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
+        # Custom AMI ships its own buildkitd; attach to it for the warm cache.
+        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
+        echo "Using remote driver to connect to buildkitd (warm cache available)"
+        if ! docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
+            echo "Creating baked-vllm-builder with remote driver"
+            docker buildx create --name baked-vllm-builder --driver remote --use "unix://${BUILDKIT_SOCKET}"
+        else
+            echo "Using existing baked-vllm-builder"
+            docker buildx use baked-vllm-builder
+        fi
+    elif ! docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+        # Neither a local buildkitd nor a reusable builder: create a new one.
+        echo "No local buildkitd found, using docker-container driver"
+        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+    else
+        # Reuse the builder left behind by an earlier run.
+        echo "Using existing builder: ${BUILDER_NAME}"
+        docker buildx use "${BUILDER_NAME}"
+    fi
+
+    # Every path above ends by bootstrapping whichever builder is now active.
+    docker buildx inspect --bootstrap
+
+    # Show the header row and the active (starred) builder; fall back to the
+    # full listing when grep matches nothing.
+    echo "Active builder:"
+    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
+}
+
+# Exit the whole script successfully when ${IMAGE_TAG} already has a manifest
+# in its registry; otherwise return so the build can proceed. Does nothing
+# when IMAGE_TAG is unset or empty.
+check_and_skip_if_image_exists() {
+    # No tag supplied: there is nothing to look up.
+    [[ -n "${IMAGE_TAG:-}" ]] || return 0
+
+    echo "--- :mag: Checking if image exists"
+    if ! docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+        echo "Image not found, proceeding with build"
+        return 0
+    fi
+
+    echo "Image already exists: ${IMAGE_TAG}"
+    echo "Skipping build"
+    exit 0
+}
+
+# Log the local docker client in to two AWS registries (region us-east-1):
+#   - the public ECR registry given by $REGISTRY, and
+#   - the private ECR registry 936637512419.dkr.ecr.us-east-1.amazonaws.com,
+#     which is the host of the cache repositories named in prepare_cache_tags.
+# Passwords are piped straight from the aws CLI into `docker login` via stdin.
+ecr_login() {
+    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+}
+
+# Resolve and export the registry cache references used by the bake build:
+#   CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
+# Reads BUILDKITE_PULL_REQUEST, BUILDKITE_BRANCH and (for pull requests)
+# BUILDKITE_PULL_REQUEST_BASE_BRANCH.
+prepare_cache_tags() {
+    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
+    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
+    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
+
+    if [[ "$BUILDKITE_PULL_REQUEST" != "false" ]]; then
+        # Pull request build: read and write a per-PR cache, and additionally
+        # read from the cache of the branch the PR targets.
+        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        CACHE_FROM="${CACHE_TO}"
+        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
+            CACHE_FROM_BASE_BRANCH="${CACHE_FROM_MAIN}"
+        else
+            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")"
+        fi
+    else
+        # Branch build: a single cache reference serves all three roles.
+        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+            CACHE_TO="${CACHE_FROM_MAIN}"
+        else
+            CACHE_TO="${TEST_CACHE_ECR}:$(clean_docker_tag "$BUILDKITE_BRANCH")"
+        fi
+        CACHE_FROM="${CACHE_TO}"
+        CACHE_FROM_BASE_BRANCH="${CACHE_TO}"
+    fi
+
+    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
+}
+
+# Ensure PARENT_COMMIT is set for the cache fallback.
+# A caller-provided PARENT_COMMIT is kept as is; otherwise the parent of HEAD
+# is looked up and exported. When HEAD has no resolvable parent the variable
+# is left empty and a notice is printed.
+resolve_parent_commit() {
+    if [[ -n "${PARENT_COMMIT:-}" ]]; then
+        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
+        return 0
+    fi
+
+    # --verify --quiet: without --verify, a failing `git rev-parse HEAD~1`
+    # still echoes the literal "HEAD~1" on stdout, which would then be taken
+    # for a commit id. With it, failure prints nothing and the fallback below
+    # yields an empty string.
+    PARENT_COMMIT=$(git rev-parse --verify --quiet "HEAD~1^{commit}" 2>/dev/null || echo "")
+    if [[ -n "${PARENT_COMMIT}" ]]; then
+        echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
+        export PARENT_COMMIT
+    else
+        echo "Could not determine parent commit (may be first commit in repo)"
+    fi
+}
+
+# Print the resolved bake configuration for ${TARGET}, save it to
+# bake-config-build-<build number|local>.json and upload that file as a
+# Buildkite artifact. A failure of the `--print` step is tolerated (|| true).
+print_bake_config() {
+    echo "--- :page_facing_up: Resolved bake configuration"
+    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
+    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
+    echo "--- :arrow_down: Uploading bake config to Buildkite"
+    # The file name falls back to "local" outside Buildkite, where the agent
+    # binary is not installed; skip the upload there instead of letting
+    # `set -e` abort the build on "command not found".
+    if command -v buildkite-agent >/dev/null 2>&1; then
+        buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+    else
+        echo "buildkite-agent not found, skipping artifact upload"
+    fi
+}
+
+#################################
+#         Main Script           #
+#################################
+# Host diagnostics are printed first, before argument validation.
+print_instance_info
+
+# Seven positional arguments are mandatory; an eighth is read as optional
+# further below.
+if [[ $# -lt 7 ]]; then
+    print_usage_and_exit
 fi

+# input args
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
 VLLM_USE_PRECOMPILED=$5
 VLLM_MERGE_BASE_COMMIT=$6
-CACHE_FROM=$7
-CACHE_TO=$8
+IMAGE_TAG=$7
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional

-# authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+# Build config.
+# CI_HCL_URL, VLLM_BAKE_FILE and BUILDER_NAME keep a value already present in
+# the environment and otherwise fall back to the defaults below; TARGET,
+# CI_HCL_PATH and BUILDKIT_SOCKET are fixed.
+TARGET="test-ci"
+CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
+VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
+BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
+# Local path that ci.hcl is downloaded to before the bake runs.
+CI_HCL_PATH="/tmp/ci.hcl"
+# Socket probed by setup_buildx_builder to detect a standalone buildkitd.
+BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"

-# docker buildx 
-docker buildx create --name vllm-builder --driver docker-container --use
-docker buildx inspect --bootstrap
-docker buildx ls
+prepare_cache_tags
+ecr_login

-# skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
-  echo "Image not found, proceeding with build..."
-else
-  echo "Image found"
-  exit 0
+# Environment info (for docs and human readers)
+#   CI_HCL_URL          - URL to ci.hcl (default: from ci-infra main branch)
+#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
+#                         NOTE(review): not read anywhere in this script;
+#                         confirm whether it is still honoured elsewhere.
+#   VLLM_BAKE_FILE      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
+#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
+#
+# Build configuration (exported as environment variables for bake):
+export BUILDKITE_COMMIT
+# PARENT_COMMIT may still be unset here; this only marks the name for export.
+# Its value is resolved later by resolve_parent_commit.
+export PARENT_COMMIT
+export IMAGE_TAG
+export IMAGE_TAG_LATEST
+export CACHE_FROM
+export CACHE_FROM_BASE_BRANCH
+export CACHE_FROM_MAIN
+export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
+
+# print args
+echo "--- :mag: Arguments"
+echo "REGISTRY: ${REGISTRY}"
+echo "REPO: ${REPO}"
+echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
+echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
+echo "IMAGE_TAG: ${IMAGE_TAG}"
+echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
+
+# print build configuration
+echo "--- :mag: Build configuration"
+echo "TARGET: ${TARGET}"
+echo "CI HCL URL: ${CI_HCL_URL}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE}"
+echo "BUILDER_NAME: ${BUILDER_NAME}"
+echo "CI_HCL_PATH: ${CI_HCL_PATH}"
+echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
+
+echo "--- :mag: Cache tags"
+echo "CACHE_TO: ${CACHE_TO}"
+echo "CACHE_FROM: ${CACHE_FROM}"
+echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
+echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
+
+check_and_skip_if_image_exists
+
+echo "--- :docker: Setting up Docker buildx bake"
+echo "Target: ${TARGET}"
+echo "CI HCL URL: ${CI_HCL_URL}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE}"
+
+if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then
+    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
+    echo "Make sure you're running from the vLLM repository root"
+    exit 1
 fi

-if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
-  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
-else
-  merge_base_commit_build_args=""
-fi
+echo "--- :arrow_down: Downloading ci.hcl"
+curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
+echo "Downloaded to ${CI_HCL_PATH}"

-# build
-docker buildx build --file docker/Dockerfile \
-  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
-  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
-  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
-  ${merge_base_commit_build_args} \
-  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
-  --cache-to type=registry,ref=${CACHE_TO},mode=max \
-  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
-  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
-  --push \
-  --target test \
-  --progress plain .
+# Pick or create the buildx builder before anything is resolved or built.
+setup_buildx_builder
+
+# Compute parent commit for cache fallback (if not already set)
+resolve_parent_commit
+export PARENT_COMMIT
+
+# Record the resolved configuration before the real build starts.
+print_bake_config
+
+# Run the bake build for ${TARGET}, combining the repository bake file with
+# the downloaded ci.hcl.
+echo "--- :docker: Building ${TARGET}"
+docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
+
+echo "--- :white_check_mark: Build complete"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -4,7 +4,8 @@ steps:
    key: image-build
    depends_on: []
    commands:
-    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    # image_build.sh reads IMAGE_TAG as its 7th positional argument and the
+    # optional IMAGE_TAG_LATEST as its 8th; only main-branch builds pass the
+    # latter, and it must follow $IMAGE_TAG rather than replace it.
+    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
--- a/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small-rocm.txt
@@ -0,0 +1,5 @@
+Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,219 +1,287 @@
 steps:
-  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-arm64-cuda-12-9
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build arm64 wheel - CUDA 13.0"
-    depends_on: ~
-    id: build-wheel-arm64-cuda-13-0
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
-      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # aarch64 build
-  - label: "Build arm64 CPU wheel"
-    depends_on: ~
-    id: build-wheel-arm64-cpu
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
-  - label: "Build wheel - CUDA 12.9"
-    depends_on: ~
-    id: build-wheel-cuda-12-9
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build wheel - CUDA 13.0"
-    depends_on: ~
-    id: build-wheel-cuda-13-0
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 CPU wheel build
-  - label: "Build x86 CPU wheel"
-    depends_on: ~
-    id: build-wheel-x86-cpu
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # Build release images (12.9)
-  - label: "Build release image (x86)"
-    depends_on: ~
-    id: build-release-image-x86
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-      # re-tag to default image tag and push, just in case arm64 build fails
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-  - label: "Build release image (arm64)"
-    depends_on: ~
-    id: build-release-image-arm64
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
-
-  # Add job to create multi-arch manifest
-  - label: "Create multi-arch manifest"
-    depends_on:
-      - build-release-image-x86
-      - build-release-image-arm64
-    id: create-multi-arch-manifest
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
-      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
-  - label: "Annotate release workflow"
-    depends_on:
-      - create-multi-arch-manifest
-    id: annotate-release-workflow
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "bash .buildkite/scripts/annotate-release.sh"
-
  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

-  - block: "Build CPU release image"
-    key: block-cpu-release-image-build
-    depends_on: ~
+  - group: "Build Python wheels"
+    key: "build-wheels"
+    steps:
+      # aarch64 / CUDA 12.9.1: builds the "build" target of docker/Dockerfile,
+      # copies dist/ out of the image into ./artifacts, then runs
+      # upload-nightly-wheels.sh with no arguments.
+      - label: "Build wheel - aarch64 - CUDA 12.9"
+        depends_on: ~
+        id: build-wheel-arm64-cuda-12-9
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
+        env:
+          DOCKER_BUILDKIT: "1"

-  - label: "Build and publish CPU release image"
-    depends_on: block-cpu-release-image-build
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
+      # aarch64 / CUDA 13.0.1: same flow as the 12.9 aarch64 step, but built
+      # on the nvidia/cuda:13.0.1-devel-ubuntu22.04 base image and uploaded
+      # with the manylinux_2_35 argument.
+      - label: "Build wheel - aarch64 - CUDA 13.0"
+        depends_on: ~
+        id: build-wheel-arm64-cuda-13-0
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04  --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"

-  - block: "Build arm64 CPU release image"
-    key: block-arm64-cpu-release-image-build
-    depends_on: ~
+      # aarch64 / CPU: builds the "vllm-build" target of docker/Dockerfile.cpu
+      # with VLLM_BUILD_ACL=ON, copies dist/ into ./artifacts, then uploads
+      # with the manylinux_2_35 argument.
+      - label: "Build wheel - aarch64 - CPU"
+        depends_on: ~
+        id: build-wheel-arm64-cpu
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"

-  - label: "Build and publish arm64 CPU release image"
-    depends_on: block-arm64-cpu-release-image-build
-    agents:
-      queue: arm64_cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
+      # x86_64 / CUDA 12.9.1: builds the "build" target of docker/Dockerfile,
+      # copies dist/ into ./artifacts, then uploads with the manylinux_2_31
+      # argument.
+      - label: "Build wheel - x86_64 - CUDA 12.9"
+        depends_on: ~
+        id: build-wheel-x86-cuda-12-9
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
+        env:
+          DOCKER_BUILDKIT: "1"

-  - block: "Build ROCm release image"
-    key: block-rocm-release-image-build
-    depends_on: ~
+      # x86_64 / CUDA 13.0.1: built on the nvidia/cuda:13.0.1-devel-ubuntu22.04
+      # base image; copies dist/ into ./artifacts, then uploads with the
+      # manylinux_2_35 argument.
+      - label: "Build wheel - x86_64 - CUDA 13.0"
+        depends_on: ~
+        id: build-wheel-x86-cuda-13-0
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"

-  - label: "Build release image (ROCm)"
-    depends_on: block-rocm-release-image-build
-    id: build-release-image-rocm
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      # Build base image first
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
-      # Build vLLM ROCm image using the base
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+      # x86_64 / CPU: builds the "vllm-build" target of docker/Dockerfile.cpu
+      # with the AVX512BF16, AVX512VNNI and AMXBF16 build args set to true,
+      # copies dist/ into ./artifacts, then uploads with the manylinux_2_35
+      # argument.
+      - label: "Build wheel - x86_64 - CPU"
+        depends_on: ~
+        id: build-wheel-x86-cpu
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "mkdir artifacts"
+          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
+        env:
+          DOCKER_BUILDKIT: "1"

-  
-  - label: "Build and publish nightly multi-arch image to DockerHub"
-    depends_on:
-      - create-multi-arch-manifest
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
-      - "docker push vllm/vllm-openai:nightly-x86_64"
-      - "docker push vllm/vllm-openai:nightly-aarch64"
-      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
-      - "docker manifest push vllm/vllm-openai:nightly"
-      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      # Clean up old nightly builds (keep only last 14)
-      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-      DOCKERHUB_USERNAME: "vllmbot"
+  # Release container images. The CUDA builds declare no dependencies
+  # (depends_on: null); the CPU and ROCm builds each wait on a manual block step.
+  # Long docker build commands are written as folded scalars (>-), which join
+  # the lines with single spaces into one command string.
+  - group: "Build release Docker images"
+    key: "build-release-images"
+    steps:
+      - label: "Build release image - x86_64 - CUDA 12.9"
+        id: build-release-image-x86
+        depends_on: null
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg USE_SCCACHE=1
+            --build-arg GIT_REPO_CHECK=1
+            --build-arg CUDA_VERSION=12.9.1
+            --build-arg FLASHINFER_AOT_COMPILE=true
+            --build-arg INSTALL_KV_CONNECTORS=true
+            --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)
+            --target vllm-openai
+            --progress plain
+            -f docker/Dockerfile .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+          # re-tag to default image tag and push, just in case arm64 build fails
+          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+      - label: "Build release image - aarch64 - CUDA 12.9"
+        id: build-release-image-arm64
+        depends_on: null
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg USE_SCCACHE=1
+            --build-arg GIT_REPO_CHECK=1
+            --build-arg CUDA_VERSION=12.9.1
+            --build-arg FLASHINFER_AOT_COMPILE=true
+            --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0'
+            --build-arg INSTALL_KV_CONNECTORS=true
+            --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)
+            --target vllm-openai
+            --progress plain
+            -f docker/Dockerfile .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+      - label: "Build release image - x86_64 - CUDA 13.0"
+        id: build-release-image-x86-cuda-13-0
+        depends_on: null
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg USE_SCCACHE=1
+            --build-arg GIT_REPO_CHECK=1
+            --build-arg CUDA_VERSION=13.0.1
+            --build-arg INSTALL_KV_CONNECTORS=true
+            --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
+            --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130
+            --target vllm-openai
+            --progress plain
+            -f docker/Dockerfile .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+          # re-tag to default image tag and push, just in case arm64 build fails
+          - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+      - label: "Build release image - aarch64 - CUDA 13.0"
+        id: build-release-image-arm64-cuda-13-0
+        depends_on: null
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          # compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg USE_SCCACHE=1
+            --build-arg GIT_REPO_CHECK=1
+            --build-arg CUDA_VERSION=13.0.1
+            --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1'
+            --build-arg INSTALL_KV_CONNECTORS=true
+            --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
+            --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130
+            --target vllm-openai
+            --progress plain
+            -f docker/Dockerfile .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
+
+      # Manual gate for the x86_64 CPU image
+      - block: "Build release image for x86_64 CPU"
+        key: block-cpu-release-image-build
+        depends_on: null
+
+      # Tagged with the release-version meta-data value and with "latest"
+      - label: "Build release image - x86_64 - CPU"
+        depends_on:
+          - block-cpu-release-image-build
+          - input-release-version
+        agents:
+          queue: cpu_queue_postmerge
+        env:
+          DOCKER_BUILDKIT: "1"
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg GIT_REPO_CHECK=1
+            --build-arg VLLM_CPU_AVX512BF16=true
+            --build-arg VLLM_CPU_AVX512VNNI=true
+            --build-arg VLLM_CPU_AMXBF16=true
+            --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)
+            --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
+            --progress plain
+            --target vllm-openai
+            -f docker/Dockerfile.cpu .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+
+      # Manual gate for the arm64 CPU image
+      - block: "Build release image for arm64 CPU"
+        key: block-arm64-cpu-release-image-build
+        depends_on: null
+
+      # Tagged with the release-version meta-data value and with "latest"
+      - label: "Build release image - arm64 - CPU"
+        depends_on:
+          - block-arm64-cpu-release-image-build
+          - input-release-version
+        agents:
+          queue: arm64_cpu_queue_postmerge
+        env:
+          DOCKER_BUILDKIT: "1"
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg GIT_REPO_CHECK=1
+            --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)
+            --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
+            --progress plain
+            --target vllm-openai
+            -f docker/Dockerfile.cpu .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+
+      # Manual gate for the ROCm image
+      - block: "Build release image for x86_64 ROCm"
+        key: block-rocm-release-image-build
+        depends_on: null
+
+      - label: "Build release image - x86_64 - ROCm"
+        id: build-release-image-rocm
+        depends_on: block-rocm-release-image-build
+        agents:
+          queue: cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          # Build base image first
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT
+            --target final
+            --progress plain
+            -f docker/Dockerfile.rocm_base .
+          # Build vLLM ROCm image using the base
+          - >-
+            DOCKER_BUILDKIT=1 docker build
+            --build-arg max_jobs=16
+            --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT
+            --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm
+            --target vllm-openai
+            --progress plain
+            -f docker/Dockerfile.rocm .
+          - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
+
+  # Combines the per-architecture images into multi-arch manifests, then (only
+  # when NIGHTLY is "1") runs the nightly push and cleanup scripts for DockerHub.
+  - group: "Publish release images"
+    key: "publish-release-images"
+    steps:
+      - label: "Create multi-arch manifest - CUDA 12.9"
+        id: create-multi-arch-manifest
+        depends_on:
+          - build-release-image-x86
+          - build-release-image-arm64
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            docker manifest create
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
+            --amend
+          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
+      - label: "Annotate release workflow - CUDA 12.9"
+        id: annotate-release-workflow
+        depends_on:
+          - create-multi-arch-manifest
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/annotate-release.sh"
+
+      - label: "Create multi-arch manifest - CUDA 13.0"
+        id: create-multi-arch-manifest-cuda-13-0
+        depends_on:
+          - build-release-image-x86-cuda-13-0
+          - build-release-image-arm64-cuda-13-0
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - >-
+            docker manifest create
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130
+            public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130
+            --amend
+          - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
+
+      - label: "Publish nightly multi-arch image to DockerHub"
+        if: build.env("NIGHTLY") == "1"
+        depends_on:
+          - create-multi-arch-manifest
+        agents:
+          queue: small_cpu_queue_postmerge
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        commands:
+          - "bash .buildkite/scripts/push-nightly-builds.sh"
+          # Clean up old nightly builds (keep only last 14)
+          - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+
+      - label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
+        if: build.env("NIGHTLY") == "1"
+        depends_on:
+          - create-multi-arch-manifest-cuda-13-0
+        agents:
+          queue: small_cpu_queue_postmerge
+        env:
+          DOCKER_BUILDKIT: "1"
+          DOCKERHUB_USERNAME: "vllmbot"
+        plugins:
+          - docker-login#v3.0.0:
+              username: vllmbot
+              password-env: DOCKERHUB_TOKEN
+        commands:
+          - "bash .buildkite/scripts/push-nightly-builds.sh cu130"
+          # Clean up old nightly builds (keep only last 14)
+          - "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
+
+  # Wheel publication: a manual confirmation block, then the upload script.
+  - group: "Publish wheels"
+    key: "publish-wheels"
+    steps:
+      - block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
+        key: block-upload-release-wheels
+        depends_on: [input-release-version, build-wheels]
+
+      - label: "Upload release wheels to PyPI and GitHub"
+        id: upload-release-wheels
+        depends_on: [block-upload-release-wheels]
+        agents:
+          queue: small_cpu_queue_postmerge
+        commands:
+          - "bash .buildkite/scripts/upload-release-wheels.sh"

  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+#
+# cherry-pick-from-milestone.sh
+# Find commits from a GitHub milestone that are missing from the current branch
+# and output them in chronological order for cherry-picking.
+#
+# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
+#
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+usage() {
+    cat <<EOF
+Usage: $(basename "$0") <milestone> [options]
+
+Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
+
+Arguments:
+    milestone       The GitHub milestone name (e.g., v0.14.0)
+
+Options:
+    --dry-run       Show the cherry-pick commands without executing (default)
+    --execute       Actually execute the cherry-picks
+    --main-branch   Specify the main branch name (default: main)
+    --help          Show this help message
+
+Examples:
+    $(basename "$0") v0.14.0
+    $(basename "$0") v0.14.0 --dry-run
+    $(basename "$0") v0.14.0 --execute
+    $(basename "$0") v0.14.0 --main-branch master
+EOF
+    exit 1
+}
+
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[OK]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+# Default values
+MILESTONE=""
+DRY_RUN=true
+MAIN_BRANCH="main"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --dry-run)
+            DRY_RUN=true
+            shift
+            ;;
+        --execute)
+            DRY_RUN=false
+            shift
+            ;;
+        --main-branch)
+            MAIN_BRANCH="$2"
+            shift 2
+            ;;
+        --help|-h)
+            usage
+            ;;
+        -*)
+            log_error "Unknown option: $1"
+            usage
+            ;;
+        *)
+            if [[ -z "$MILESTONE" ]]; then
+                MILESTONE="$1"
+            else
+                log_error "Unexpected argument: $1"
+                usage
+            fi
+            shift
+            ;;
+    esac
+done
+
+# Validate milestone argument
+if [[ -z "$MILESTONE" ]]; then
+    log_error "Milestone is required"
+    usage
+fi
+
+# Check if we're in a git repository
+if ! git rev-parse --is-inside-work-tree &>/dev/null; then
+    log_error "Not in a git repository"
+    exit 1
+fi
+
+# Check if gh CLI is available
+if ! command -v gh &>/dev/null; then
+    log_error "GitHub CLI (gh) is not installed"
+    exit 1
+fi
+
+# Check if authenticated with gh
+if ! gh auth status &>/dev/null; then
+    log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
+    exit 1
+fi
+
+CURRENT_BRANCH=$(git branch --show-current)
+log_info "Current branch: ${CURRENT_BRANCH}"
+log_info "Main branch: ${MAIN_BRANCH}"
+log_info "Milestone: ${MILESTONE}"
+echo ""
+
+# Fetch latest from remote
+log_info "Fetching latest from remote..."
+git fetch origin "$MAIN_BRANCH" --quiet
+
+# Get merged PRs from the milestone, sorted by merge date
+log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
+
+# Store PR data in a temp file
+PR_DATA=$(mktemp)
+trap "rm -f $PR_DATA" EXIT
+
+if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
+    --limit 1000 \
+    --json number,title,mergeCommit,mergedAt \
+    --jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
+    log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
+    log_error "This could be due to:"
+    log_error "  - Milestone does not exist"
+    log_error "  - Network/authentication issues"
+    log_error "  - Invalid milestone name format"
+    exit 1
+fi
+
+if [[ ! -s "$PR_DATA" ]]; then
+    log_warn "No merged PRs found for milestone '${MILESTONE}'"
+    exit 0
+fi
+
+TOTAL_PRS=$(wc -l < "$PR_DATA")
+log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
+echo ""
+
+# Find commits that are missing from current branch.
+# MISSING_COMMITS holds the SHAs; MISSING_INFO holds a matching description
+# at the same index.
+MISSING_COMMITS=()
+MISSING_INFO=()
+
+# Succeed when HEAD already contains a change with the same patch as commit $1
+# under a different SHA, i.e. an earlier clean cherry-pick. `git cherry` marks
+# such a commit with a leading "-". A pick whose conflicts were resolved by
+# hand has a different patch and is not detected.
+has_equivalent_patch() {
+    local verdict
+    verdict=$(git cherry HEAD "$1" "$1^" 2>/dev/null || true)
+    [[ "$verdict" == "- "* ]]
+}
+
+while IFS=$'\t' read -r sha pr_number title; do
+    # Skip if SHA is empty or null
+    if [[ -z "$sha" || "$sha" == "null" ]]; then
+        log_warn "PR #${pr_number} has no merge commit SHA, skipping"
+        continue
+    fi
+
+    # A PR counts as present when its merge commit is an ancestor of HEAD, or
+    # when an equivalent patch was already cherry-picked. Without the second
+    # test, re-running this script on a release branch reports (and with
+    # --execute re-picks) every commit picked by a previous run.
+    if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
+        log_success "PR #${pr_number} already in branch: ${title:0:60}"
+    elif has_equivalent_patch "$sha"; then
+        log_success "PR #${pr_number} already cherry-picked: ${title:0:60}"
+    else
+        log_warn "PR #${pr_number} MISSING: ${title:0:60}"
+        MISSING_COMMITS+=("$sha")
+        MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
+    fi
+done < "$PR_DATA"
+
+echo ""
+
+if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
+    log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
+    exit 0
+fi
+
+log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
+echo ""
+
+# Output the cherry-pick commands
+echo "=========================================="
+echo "Cherry-pick commands (in chronological order):"
+echo "=========================================="
+echo ""
+
+for info in "${MISSING_INFO[@]}"; do
+    echo "# $info"
+done
+echo ""
+
+echo "# Run these commands to cherry-pick all missing commits:"
+echo "git cherry-pick ${MISSING_COMMITS[*]}"
+echo ""
+
+# Or one by one
+echo "# Or cherry-pick one at a time:"
+for sha in "${MISSING_COMMITS[@]}"; do
+    echo "git cherry-pick $sha"
+done
+echo ""
+
+# Execute if requested
+if [[ "$DRY_RUN" == false ]]; then
+    echo "=========================================="
+    log_info "Executing cherry-picks..."
+    echo "=========================================="
+    
+    for i in "${!MISSING_COMMITS[@]}"; do
+        sha="${MISSING_COMMITS[$i]}"
+        info="${MISSING_INFO[$i]}"
+        
+        echo ""
+        log_info "Cherry-picking: $info"
+        
+        if git cherry-pick "$sha"; then
+            log_success "Successfully cherry-picked $sha"
+        else
+            log_error "Failed to cherry-pick $sha"
+            log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
+            exit 1
+        fi
+    done
+    
+    echo ""
+    log_success "All cherry-picks completed successfully!"
+else
+    echo "=========================================="
+    echo -e "${YELLOW}Dry run mode - no changes made${NC}"
+    echo "Run with --execute to perform the cherry-picks"
+    echo "=========================================="
+fi
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -3,7 +3,14 @@
 set -ex

 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
-# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+# This script uses DockerHub API to list and delete old tags with specified prefix
+# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
+# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
+
+# Get tag prefix from argument, default to "nightly-" if not provided
+TAG_PREFIX="${1:-nightly-}"
+
+echo "Cleaning up tags with prefix: $TAG_PREFIX"

 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
@@ -45,7 +52,7 @@ get_all_tags() {
        set -x
        
        # Get both last_updated timestamp and tag name, separated by |
-        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+        local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
        
        if [ -z "$tags" ]; then
            break
--- a/.buildkite/scripts/push-nightly-builds.sh
+++ b/.buildkite/scripts/push-nightly-builds.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -ex
+
+# Re-tag the per-architecture release images from AWS ECR Public as nightly
+# images on DockerHub and publish multi-arch manifests for them.
+#
+# Usage: push-nightly-builds.sh [TAG_VARIANT]
+#   TAG_VARIANT - optional variant such as "cu130"; defaults to empty.
+#
+# Due to limits in cleanup script, variants use separate tags like "cu130-nightly",
+# otherwise they will be cleaned up together with the main "nightly" tags.
+
+# Fail early with a clear message instead of building malformed tags
+: "${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set}"
+
+TAG_VARIANT="${1:-}"
+if [ -n "$TAG_VARIANT" ]; then
+    ORIG_TAG_SUFFIX="-$TAG_VARIANT"
+    TAG_NAME="$TAG_VARIANT-nightly"
+else
+    ORIG_TAG_SUFFIX=""
+    TAG_NAME="nightly"
+fi
+
+ORIG_TAG_NAME="$BUILDKITE_COMMIT"
+
+# Source (ECR Public) and destination (DockerHub) repositories
+ECR_REGISTRY="public.ecr.aws/q9t5s3a7"
+ECR_REPO="$ECR_REGISTRY/vllm-release-repo"
+HUB_REPO="vllm/vllm-openai"
+ARCHES=(x86_64 aarch64)
+
+echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
+
+# pull original arch-dependent images from AWS ECR Public
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$ECR_REGISTRY"
+for arch in "${ARCHES[@]}"; do
+    docker pull "$ECR_REPO:$ORIG_TAG_NAME-$arch$ORIG_TAG_SUFFIX"
+done
+# tag arch-dependent images
+for arch in "${ARCHES[@]}"; do
+    docker tag "$ECR_REPO:$ORIG_TAG_NAME-$arch$ORIG_TAG_SUFFIX" "$HUB_REPO:$TAG_NAME-$arch"
+done
+# push arch-dependent images to DockerHub (only after every pull and tag succeeded)
+for arch in "${ARCHES[@]}"; do
+    docker push "$HUB_REPO:$TAG_NAME-$arch"
+done
+# push arch-independent manifest to DockerHub: once under the moving tag and
+# once under a tag that also carries the commit
+docker manifest create "$HUB_REPO:$TAG_NAME" "$HUB_REPO:$TAG_NAME-x86_64" "$HUB_REPO:$TAG_NAME-aarch64" --amend
+docker manifest create "$HUB_REPO:$TAG_NAME-$BUILDKITE_COMMIT" "$HUB_REPO:$TAG_NAME-x86_64" "$HUB_REPO:$TAG_NAME-aarch64" --amend
+docker manifest push "$HUB_REPO:$TAG_NAME"
+docker manifest push "$HUB_REPO:$TAG_NAME-$BUILDKITE_COMMIT"
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@@ -18,15 +18,18 @@ wait_for_server() {

 MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

-# Set BACKENDS based on platform
+# Set BACKENDS and platform-specific args based on platform
 if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
+  PLATFORM_ARGS=("--no-async-scheduling")
+  echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
 else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+  PLATFORM_ARGS=()
 fi

 cleanup() {
@@ -54,6 +57,7 @@ for BACK in "${BACKENDS[@]}"; do
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
+    "${PLATFORM_ARGS[@]}" \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
--- a/.buildkite/scripts/trigger-ci-build.sh
+++ b/.buildkite/scripts/trigger-ci-build.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+#
+# trigger-ci-build.sh
+# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
+# with RUN_ALL=1 and NIGHTLY=1 environment variables.
+#
+# Usage: ./trigger-ci-build.sh [options]
+#
+# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
+#
+# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
+#
+
+set -euo pipefail
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Default configuration
+PIPELINE="ci"
+DRY_RUN=true
+
+usage() {
+    cat <<EOF
+Usage: $(basename "$0") [options]
+
+Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
+Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
+
+SAFETY: Dry-run by default. Use --execute to actually trigger a build.
+
+Options:
+    --execute       Actually trigger the build (default: dry-run)
+    --pipeline      Buildkite pipeline slug (default: ${PIPELINE})
+    --commit        Override commit SHA (default: current HEAD)
+    --branch        Override branch name (default: current branch)
+    --message       Custom build message (default: auto-generated)
+    --help          Show this help message
+
+Prerequisites:
+    - bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
+    - bk configured: bk configure
+
+Examples:
+    $(basename "$0")                        # Dry-run, show what would happen
+    $(basename "$0") --execute              # Actually trigger the build
+    $(basename "$0") --pipeline ci-shadow   # Dry-run with different pipeline
+EOF
+    exit 1
+}
+
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+log_success() {
+    echo -e "${GREEN}[OK]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+# Parse arguments
+COMMIT=""
+BRANCH=""
+MESSAGE=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --execute)
+            DRY_RUN=false
+            shift
+            ;;
+        --pipeline)
+            PIPELINE="$2"
+            shift 2
+            ;;
+        --commit)
+            COMMIT="$2"
+            shift 2
+            ;;
+        --branch)
+            BRANCH="$2"
+            shift 2
+            ;;
+        --message)
+            MESSAGE="$2"
+            shift 2
+            ;;
+        --help|-h)
+            usage
+            ;;
+        -*)
+            log_error "Unknown option: $1"
+            usage
+            ;;
+        *)
+            log_error "Unexpected argument: $1"
+            usage
+            ;;
+    esac
+done
+
+# Check if bk CLI is installed
+if ! command -v bk &>/dev/null; then
+    log_error "Buildkite CLI (bk) is not installed"
+    echo ""
+    echo "Install with:"
+    echo "  brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
+    echo ""
+    echo "Then configure:"
+    echo "  bk configure"
+    exit 1
+fi
+
+# Check if we're in a git repository
+if ! git rev-parse --is-inside-work-tree &>/dev/null; then
+    log_error "Not in a git repository"
+    exit 1
+fi
+
+# Get current commit and branch if not overridden
+if [[ -z "$COMMIT" ]]; then
+    COMMIT=$(git rev-parse HEAD)
+fi
+
+if [[ -z "$BRANCH" ]]; then
+    BRANCH=$(git branch --show-current)
+    if [[ -z "$BRANCH" ]]; then
+        # Detached HEAD: there is no current branch, and --abbrev-ref prints
+        # the literal string "HEAD". Keep that fallback but say so, because
+        # the build (and the push hint below) would otherwise use "HEAD" as
+        # the branch name without any notice.
+        BRANCH=$(git rev-parse --abbrev-ref HEAD)
+        if [[ "$BRANCH" == "HEAD" ]]; then
+            log_warn "Detached HEAD: no branch name could be determined; pass --branch to set one"
+        fi
+    fi
+fi
+
+# Generate default message if not provided: "[Manual] <commit subject>", or
+# "[Manual] Manual build" when the subject cannot be read
+if [[ -z "$MESSAGE" ]]; then
+    COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
+    MESSAGE="[Manual] ${COMMIT_MSG}"
+fi
+
+# Safety check: Verify the commit exists on the remote
+log_info "Verifying commit exists on remote..."
+git fetch origin --quiet 2>/dev/null || true
+
+# Check if commit is reachable from any remote branch
+REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
+if [[ -z "$REMOTE_BRANCHES" ]]; then
+    log_error "Commit ${COMMIT} does not exist on any remote branch!"
+    echo ""
+    echo "The CI system will fail to checkout this commit."
+    echo "Please push your changes first:"
+    echo ""
+    echo "  git push origin ${BRANCH}"
+    echo ""
+    exit 1
+fi
+
+log_success "Commit found on remote branches:"
+echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/  /'
+if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
+    echo "  ... and more"
+fi
+echo ""
+
+log_info "Pipeline: ${PIPELINE}"
+log_info "Branch: ${BRANCH}"
+log_info "Commit: ${COMMIT}"
+log_info "Message: ${MESSAGE}"
+log_info "Environment: RUN_ALL=1, NIGHTLY=1"
+echo ""
+
+# Build the command as an array so every value stays a single argument
+CMD=(bk build create
+    -y
+    -w
+    -i
+    --pipeline "${PIPELINE}"
+    --commit "${COMMIT}"
+    --branch "${BRANCH}"
+    --message "${MESSAGE}"
+    --env "RUN_ALL=1"
+    --env "NIGHTLY=1"
+)
+
+if [[ "$DRY_RUN" == true ]]; then
+    echo "=========================================="
+    log_warn "DRY-RUN MODE - No build will be triggered"
+    echo "=========================================="
+    echo ""
+    echo "Command that would be executed:"
+    echo ""
+    # Escape single quotes in values for safe shell display
+    escape_for_shell() {
+        printf '%s' "$1" | sed "s/'/'\\\\''/g"
+    }
+    echo "  bk build create \\"
+    echo "    -y \\"
+    echo "    -w \\"
+    echo "    -i \\"
+    echo "    --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
+    echo "    --commit '$(escape_for_shell "${COMMIT}")' \\"
+    echo "    --branch '$(escape_for_shell "${BRANCH}")' \\"
+    echo "    --message '$(escape_for_shell "${MESSAGE}")' \\"
+    echo "    --env 'RUN_ALL=1' \\"
+    echo "    --env 'NIGHTLY=1'"
+    echo ""
+    echo "=========================================="
+    echo -e "${YELLOW}To actually trigger this build, run:${NC}"
+    echo ""
+    # Repeat the resolved options: a bare "$0 --execute" would fall back to the
+    # defaults and trigger a different build than the one displayed above when
+    # --pipeline/--commit/--branch/--message were given to this dry run.
+    echo "  $0 --execute \\"
+    echo "    --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
+    echo "    --commit '$(escape_for_shell "${COMMIT}")' \\"
+    echo "    --branch '$(escape_for_shell "${BRANCH}")' \\"
+    echo "    --message '$(escape_for_shell "${MESSAGE}")'"
+    echo "=========================================="
+    exit 0
+fi
+
+log_info "Triggering build..."
+
+# Execute the command - bk will print the URL and open browser
+"${CMD[@]}"
--- a/.buildkite/scripts/upload-nightly-wheels.sh
+++ b/.buildkite/scripts/upload-nightly-wheels.sh
--- a/.buildkite/scripts/upload-release-wheels.sh
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+set -e
+
+BUCKET="vllm-wheels"
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
+echo "Release version from Buildkite: $RELEASE_VERSION"
+GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null || true) # '|| true': let the check below report the failure instead of 'set -e' aborting silently
+if [ -z "$GIT_VERSION" ]; then
+    echo "[FATAL] Not on a git tag, cannot create release."
+    exit 1
+else
+    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
+fi
+# sanity check for version mismatch
+if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+    echo "[WARNING] Force release and ignore version mismatch"
+  else
+    echo "[FATAL] Release version from Buildkite does not match Git version."
+    exit 1
+  fi
+fi
+PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
+
+# check pypi token
+if [ -z "$PYPI_TOKEN" ]; then
+  echo "[FATAL] PYPI_TOKEN is not set."
+  exit 1
+else
+  export TWINE_USERNAME="__token__"
+  export TWINE_PASSWORD="$PYPI_TOKEN"
+fi
+
+# check github token
+if [ -z "$GITHUB_TOKEN" ]; then
+  echo "[FATAL] GITHUB_TOKEN is not set."
+  exit 1
+else
+  export GH_TOKEN="$GITHUB_TOKEN"
+fi
+
+set -x # avoid printing secrets above
+
+# download gh CLI from github
+# Get latest gh CLI version from GitHub API
+GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
+if [ -z "$GH_VERSION" ]; then
+  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
+  exit 1
+fi
+echo "Downloading gh CLI version: $GH_VERSION"
+GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
+GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
+GH_INSTALL_DIR="/tmp/gh-install"
+mkdir -p "$GH_INSTALL_DIR"
+pushd "$GH_INSTALL_DIR"
+curl -L -o "$GH_TARBALL" "$GH_URL"
+tar -xzf "$GH_TARBALL"
+GH_BIN=$(find . -name "gh" -type f -executable | head -n 1 | xargs -r realpath) # 'xargs -r' skips realpath when nothing is found, so the check below is reachable under 'set -e'
+if [ -z "$GH_BIN" ]; then
+  echo "[FATAL] Failed to find gh CLI executable"
+  exit 1
+fi
+echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
+echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
+command "$GH_BIN" release list --limit 5
+popd
+
+# install twine from pypi
+python3 -m venv /tmp/vllm-release-env
+source /tmp/vllm-release-env/bin/activate
+pip install twine
+python3 -m twine --version
+
+# copy release wheels to local directory
+DIST_DIR=/tmp/vllm-release-dist
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+echo "Copying wheels to local directory"
+mkdir -p $DIST_DIR
+# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
+aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
+echo "Wheels copied to local directory"
+# generate source tarball
+git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
+ls -la $DIST_DIR
+
+
+# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
+PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
+if [ -z "$PYPI_WHEEL_FILES" ]; then
+  echo "No default variant wheels found, quitting..."
+  exit 1
+fi
+python3 -m twine check $PYPI_WHEEL_FILES
+python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+echo "Wheels uploaded to PyPI"
+
+# create release on GitHub with the release version and all wheels
+command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -71,6 +71,7 @@ steps:
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
@@ -82,6 +83,7 @@ steps:
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
@@ -428,6 +430,8 @@ steps:
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
@@ -452,10 +456,12 @@ steps:
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
-    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+    - pytest -v -s v1/attention

 - label: V1 Test others (CPU) # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
@@ -703,6 +709,17 @@ steps:
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py

+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
@@ -855,7 +872,7 @@ steps:

 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
@@ -1114,7 +1131,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1451,7 +1468,7 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 30
@@ -1462,10 +1479,10 @@ steps:
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-    - VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+    - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

 - label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 15
@@ -1476,7 +1493,7 @@ steps:
    - tests/v1/kv_connector/nixl_integration/
  commands:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-    - VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+    - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

 ##### multi gpus test #####
 ##### A100 test #####
@@ -1662,17 +1679,6 @@ steps:
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

- label: DeepSeek V2-Lite Async EPLB Accuracy
-  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030

 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
  timeout_in_minutes: 60
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -64,6 +64,7 @@ steps:
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
@@ -75,6 +76,7 @@ steps:
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
@@ -374,6 +376,8 @@ steps:
  timeout_in_minutes: 30
  gpu: h100
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
@@ -396,10 +400,12 @@ steps:
  timeout_in_minutes: 30
  gpu: b200
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
-    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+    - pytest -v -s v1/attention

 - label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
@@ -624,6 +630,56 @@ steps:
    - pytest -v -s kernels/moe/test_batched_deepgemm.py
    - pytest -v -s kernels/attention/test_deepgemm_attention.py

+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
+  
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+  
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  num_gpus: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
+
+
 - label: Model Executor Test # 23min
  timeout_in_minutes: 35
  torch_nightly: true
@@ -951,7 +1007,7 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 21 min
+- label: Blackwell Test # 23 min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
@@ -961,7 +1017,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -991,6 +1047,8 @@ steps:
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py

 - label: Blackwell Fusion and Compile Tests # 30 min
  timeout_in_minutes: 40
@@ -1045,6 +1103,48 @@ steps:
    # Run all e2e fusion tests
    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

+- label: Hopper Fusion E2E Tests (H100) # 10min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  gpu: h100
+  optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # skip Llama-4 since it does not fit on this device
+    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
+
+- label: Hopper Fusion Distributed E2E Tests (2xH100)  # 70min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  gpu: h100
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1216,7 +1316,7 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+  - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins

 - label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
@@ -1344,22 +1444,31 @@ steps:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

-##### H200 test #####
- label: Distributed Tests (H200) # optional
-  gpu: h200
+- label: Sequence Parallel Tests (H100) # 60 min
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: h100
+  optional: true
+  num_gpus: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run sequence parallel tests
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+
+- label: Distributed Tests (H100) # optional
+  gpu: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

+##### H200 test #####
+
 - label: LM Eval Large Models (H200) # optional
  timeout_in_minutes: 60
  gpu: h200
--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,8 +4,10 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  gpu: h100
+  device: h100
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
@@ -13,9 +15,11 @@ steps:

 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  gpu: b200
+  device: b200
  source_file_dependencies:
+    - vllm/config/attention.py
+    - vllm/model_executor/layers/attention
    - vllm/v1/attention
    - tests/v1/attention
  commands:
-    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+    - pytest -v -s v1/attention
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -26,7 +26,7 @@ steps:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    # this runner has 2 GPUs available even though num_devices=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
@@ -37,9 +37,9 @@ steps:
 - label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -18,7 +18,7 @@ steps:
 - label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -54,7 +54,7 @@ steps:
 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
@@ -103,8 +103,8 @@ steps:

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  gpu: h100
-  num_gpus: 8
+  device: h100
+  num_devices: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +120,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: Distributed Tests (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -133,26 +133,34 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Distributed Tests (2 GPUs)(H200)
-  gpu: h200
+- label: Sequence Parallel Tests (H100)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run sequence parallel tests
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+
+- label: Distributed Tests (2 GPUs)(H100)
+  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py

 - label: Distributed Tests (2 GPUs)(B200)
-  gpu: b200
+  device: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -161,8 +169,9 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  num_nodes: 2
+  no_plugin: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -176,7 +185,7 @@ steps:
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
@@ -184,10 +193,21 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

+- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
 - label: Pipeline + Context Parallelism (4 GPUs))
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -196,4 +216,46 @@ steps:
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: Hopper Fusion E2E Tests (H100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # skip Llama-4 since it does not fit on this device
+    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
+
+- label: Hopper Fusion Distributed E2E Tests (2xH100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,27 +4,27 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,10 +33,11 @@ steps:
  timeout_in_minutes: 30
  optional: true
  soft_fail: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -23,4 +23,8 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle the (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    # Run the rest of v1/engine tests
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,7 +14,7 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -57,8 +57,8 @@ steps:

 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  gpu: h100
-  num_gpus: 1
+  device: h100
+  num_devices: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -77,7 +77,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -85,7 +85,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -114,4 +114,55 @@ steps:
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
-    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py
+
+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  device: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+
+ 
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+  
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -12,9 +12,9 @@ steps:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt

 - label: LM Eval Large Models (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -24,9 +24,9 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

 - label: LM Eval Large Models (4 GPUs)(H100)
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,10 +37,39 @@ steps:

 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
+
+- label: LM Eval Large Models (H200)
+  timeout_in_minutes: 60
+  device: h200
+  optional: true
+  num_devices: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+
+- label: MoE Refactor Integration Test (H100 - TEMPORARY)
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+  
+- label: MoE Refactor Integration Test (B200 - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:

 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -31,7 +31,7 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/v1
-  no_gpu: true
+  device: cpu
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -82,7 +82,7 @@ steps:

 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -121,17 +121,19 @@ steps:
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
+  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  no_gpu: true
+  device: cpu
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
@@ -140,7 +142,7 @@ steps:
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -153,7 +155,7 @@ steps:

 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -44,7 +44,7 @@ steps:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
-  no_gpu: true
+  device: cpu
  commands:
    - pytest -v -s models/test_utils.py models/test_vision.py

--- a/.buildkite/test_areas/models_distributed.yaml
+++ b/.buildkite/test_areas/models_distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Model Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -18,7 +18,7 @@ steps:
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
-  no_gpu: true
+  device: cpu
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Plugin Tests (2 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -16,14 +16,14 @@ steps:
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

 - label: Quantized MoE Test (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Weight Loading Multiple GPU  # 33min
  timeout_in_minutes: 45
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  optional: true
  source_file_dependencies:
  - vllm/
@@ -15,8 +15,8 @@ steps:

 - label: Weight Loading Multiple GPU - Large Models # optional
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  gpu: a100
+  num_devices: 2
+  device: a100
  optional: true
  source_file_dependencies:
  - vllm/
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -414,6 +414,18 @@ pull_request_rules:
      remove:
        - needs-rebase

+- name: label-bug
+  description: Automatically apply bug label
+  conditions:
+    - label != stale
+    - or:
+      - title~=(?i)\bbug\b
+      - title~=(?i)\bbugfix\b
+  actions:
+    label:
+      add:
+        - bug
+
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -29,8 +29,9 @@ jobs:

      - name: Install dependencies and build vLLM
        run: |
+          uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
-          uv pip install -e .
+          uv pip install -e . --no-build-isolation
        env:
          CMAKE_BUILD_PARALLEL_LEVEL: 4

--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,9 @@ vllm/vllm_flash_attn/*
 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*

+# FlashMLA interface copied from source
+vllm/third_party/flashmla/flash_mla_interface.py
+
 # triton jit
 .triton

@@ -191,6 +194,9 @@ CLAUDE.md
 AGENTS.md
 .codex/

+# Cursor
+.cursor/
+
 # DS Store
 .DS_Store

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -147,6 +147,13 @@ repos:
    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
+  - id: validate-docker-versions
+    name: Validate docker/versions.json matches Dockerfile
+    entry: python tools/generate_versions_json.py --check
+    language: python
+    files: ^docker/(Dockerfile|versions\.json)$
+    pass_filenames: false
+    additional_dependencies: [dockerfile-parse]
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MARLIN_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()

    if (MARLIN_ARCHS)
-      file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+      file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
        CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      endif()
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

-      file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+      file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
        CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()

    if (MARLIN_SM75_ARCHS) 
-      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
+      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
        CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()

    if (MARLIN_FP8_ARCHS) 
-      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
        CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
@@ -459,10 +459,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

    set(MARLIN_SRCS
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
-       "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
-       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
+       "csrc/quantization/marlin/marlin.cu"
+       "csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
+       "csrc/quantization/marlin/gptq_marlin_repack.cu"
+       "csrc/quantization/marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
--- a/benchmarks/kernels/bench_nvfp4_quant.py
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

 PROVIDER_CFGS = {
-    "vllm": dict(backend="vllm", enabled=True),
-    "flashinfer": dict(backend="flashinfer", enabled=True),
+    "vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
+    "vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
+    "flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
+    "flashinfer-swizzle": dict(
+        backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
+    ),
 }

 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
-        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):

    if cfg["backend"] == "vllm":
        # vLLM's FP4 quantization
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: ops.scaled_fp4_quant(a, a_global_scale),
-            quantiles=quantiles,
-        )
+        if cfg["is_sf_swizzled_layout"]:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: ops.scaled_fp4_quant(
+                    a, a_global_scale, is_sf_swizzled_layout=True
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: ops.scaled_fp4_quant(
+                    a, a_global_scale, is_sf_swizzled_layout=False
+                ),
+                quantiles=quantiles,
+            )
    elif cfg["backend"] == "flashinfer":
        # FlashInfer's FP4 quantization
-        # Use is_sf_swizzled_layout=True to match vLLM's output format
-        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
-            lambda: flashinfer_fp4_quantize(
-                a, a_global_scale, is_sf_swizzled_layout=True
-            ),
-            quantiles=quantiles,
-        )
+        if cfg["is_sf_swizzled_layout"]:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: flashinfer_fp4_quantize(
+                    a, a_global_scale, is_sf_swizzled_layout=True
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+                lambda: flashinfer_fp4_quantize(
+                    a, a_global_scale, is_sf_swizzled_layout=False
+                ),
+                quantiles=quantiles,
+            )

    # Convert ms to us for better readability at small batch sizes
    to_us = lambda t_ms: t_ms * 1000
@@ -92,7 +113,9 @@ def prepare_shapes(args):
    return out


-def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
+def _test_accuracy_once(
+    M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
+):
    """Test accuracy between vLLM and FlashInfer FP4 quantization."""
    # Create input tensor
    a = torch.randn((M, K), device=device, dtype=dtype)
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
    a_global_scale = compute_global_scale(a)

    # vLLM quantization
-    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
+    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
+        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
+    )

    # FlashInfer quantization (with swizzled layout to match vLLM's output)
    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
-        a, a_global_scale, is_sf_swizzled_layout=True
+        a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
    )
    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)

@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
        vllm_fp4,
        flashinfer_fp4,
    )
-    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
+    # Compare scales
+    torch.testing.assert_close(
+        vllm_scale,
+        flashinfer_scale,
+    )
+    print(
+        f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED"  # noqa: E501
+    )


 def test_accuracy():
@@ -130,9 +162,10 @@ def test_accuracy():
    Ms = [1, 1024]
    Ks = [4096]

-    for M in Ms:
-        for K in Ks:
-            _test_accuracy_once(M, K, dtype, device)
+    for is_sf_swizzled_layout in [True, False]:
+        for M in Ms:
+            for K in Ks:
+                _test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)

    print("\nAll accuracy tests passed!")

@@ -145,7 +178,7 @@ if __name__ == "__main__":
        "--models",
        nargs="+",
        type=str,
-        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -7,7 +7,7 @@ import itertools
 import torch

 import vllm.model_executor.layers.activation  # noqa F401
-from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.custom_op import op_registry
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@@ -33,14 +33,14 @@ def benchmark_activation(
    torch.set_default_device(device)

    if func_name == "gelu_and_mul":
-        layer = CustomOp.op_registry[func_name](approximate="none")
+        layer = op_registry[func_name](approximate="none")
    elif func_name == "gelu_and_mul_tanh":
-        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+        layer = op_registry["gelu_and_mul"](approximate="tanh")
    elif func_name == "fatrelu_and_mul":
        threshold = 0.5
-        layer = CustomOp.op_registry[func_name](threshold)
+        layer = op_registry[func_name](threshold)
    else:
-        layer = CustomOp.op_registry[func_name]()
+        layer = op_registry[func_name]()

    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
    compiled_layer = torch.compile(layer.forward_native)
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
@@ -138,12 +139,13 @@ def bench_run(
    fn = mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        CutlassExpertsFp8(
-            out_dtype=a.dtype,
-            e=num_experts,
-            n=n,
-            k=k,
+            moe_config=make_dummy_moe_config(
+                num_experts=num_experts,
+                hidden_dim=k,
+                intermediate_size_per_partition=n,
+                in_dtype=a.dtype,
+            ),
            quant_config=quant_config,
-            device=w1.device,
        ),
    )

--- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py
@@ -12,6 +12,7 @@ import torch
 import torch.utils.benchmark as benchmark

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
@@ -196,10 +197,9 @@ def bench_run(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp4(
-                out_dtype=dtype,
-                max_experts_per_worker=e,
+                make_dummy_moe_config(),
                quant_config=quant_config,
            ),
        )
@@ -242,10 +242,9 @@ def bench_run(
        )

        kernel = mk.FusedMoEModularKernel(
-            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
+            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp4(
-                out_dtype=dtype,
-                max_experts_per_worker=e,
+                make_dummy_moe_config(),
                quant_config=quant_config,
            ),
        )
--- a/benchmarks/kernels/benchmark_fused_topk.py
+++ b/benchmarks/kernels/benchmark_fused_topk.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
+from vllm.triton_utils import triton
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+num_tokens_range = [2**i for i in range(0, 8, 2)]
+num_experts_range = [16, 32, 64, 128, 256, 512]
+topk_range = [3, 4]
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+
+
+def torch_topk(
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    scoring_func: str = "softmax",
+):
+    # Reference routing in plain PyTorch: score in fp32, then pick the top-k.
+    logits = gating_output.float()
+    use_softmax = scoring_func == "softmax"
+    scores = torch.softmax(logits, dim=-1) if use_softmax else torch.sigmoid(logits)
+    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+    return topk_weights, topk_ids
+
+
+def get_benchmark(scoring_func):  # returns a perf_report-wrapped benchmark
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["num_tokens", "num_experts", "topk"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["torch", "vllm"],
+            line_names=["Torch", "vLLM"],
+            styles=[("blue", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"fused-topk-perf-{scoring_func}",
+            args={},
+        )
+    )
+    def benchmark(num_tokens, num_experts, topk, provider):
+        dtype = torch.bfloat16
+        hidden_size = 1024
+        renormalize = True
+        hidden_states = torch.randn(
+            (num_tokens, hidden_size), dtype=dtype, device="cuda"
+        )
+        gating_output = torch.randn(
+            (num_tokens, num_experts), dtype=dtype, device="cuda"
+        )
+
+        quantiles = [0.5, 0.2, 0.8]  # median, p20, p80 -> ms, min_ms, max_ms
+
+        if provider == "torch":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: torch_topk(
+                    gating_output=gating_output,
+                    topk=topk,
+                    renormalize=renormalize,
+                    scoring_func=scoring_func,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: fused_topk(
+                    hidden_states=hidden_states,
+                    gating_output=gating_output,
+                    topk=topk,
+                    renormalize=renormalize,
+                    scoring_func=scoring_func,
+                ),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * min_ms, 1000 * max_ms  # ms -> us; (y, y_min, y_max)
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
+    parser.add_argument("--scoring-func", type=str, default="softmax")
+    parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
+    args = parser.parse_args()
+
+    # Get the benchmark function
+    benchmark = get_benchmark(args.scoring_func)
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
 from benchmark_shapes import WEIGHT_SHAPES_MOE

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
@@ -134,13 +135,13 @@ def bench_run(
        fn = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=a.dtype,
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-                e=w2.shape[0],
-                n=w2.shape[2],
-                k=w2.shape[1],
+                moe_config=make_dummy_moe_config(
+                    num_experts=w2.shape[0],
+                    hidden_dim=w2.shape[1],
+                    intermediate_size_per_partition=w2.shape[2],
+                    in_dtype=a.dtype,
+                ),
                quant_config=quant_config,
-                device=w1.device,
            ),
        )

@@ -166,13 +167,13 @@ def bench_run(
        fn = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=a.dtype,
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-                e=w2.shape[0],
-                n=w2.shape[2],
-                k=w2.shape[1],
+                moe_config=make_dummy_moe_config(
+                    num_experts=w2.shape[0],
+                    hidden_dim=w2.shape[1],
+                    intermediate_size_per_partition=w2.shape[2],
+                    in_dtype=a.dtype,
+                ),
                quant_config=quant_config,
-                device=w1.device,
            ),
        )

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
        assert bt.w_tok_s is None
        assert bt.group_size is not None

-        fn = lambda: ops.gptq_marlin_gemm(
+        fn = lambda: ops.marlin_gemm(
            a=bt.a,
            c=None,
            b_q_weight=w_q,
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -239,7 +239,7 @@ def bench_run(
        "sm_version": sm_version,
        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
        # Kernels
-        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "marlin_gemm": ops.marlin_gemm,
        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
        "gptq_marlin_repack": ops.gptq_marlin_repack,
        "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
@@ -263,21 +263,21 @@ def bench_run(

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
-            description="gptq_marlin_gemm",
+            description="marlin_gemm",
        ).blocked_autorange(min_run_time=min_run_time)
    )

    results.append(
        benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
-            description="gptq_marlin_gemm_fp32",
+            description="marlin_gemm_fp32",
        ).blocked_autorange(min_run_time=min_run_time)
    )

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -15,11 +15,18 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm

+from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
    FusedMoEQuantConfig,
+    RoutingMethodType,
    _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+    TritonOrDeepGemmExperts,
+)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
@@ -194,10 +201,36 @@ def benchmark_config(
            block_shape=block_quant_shape,
        )

+        deep_gemm_experts = None
+        if use_deep_gemm:
+            deep_gemm_experts = mk.FusedMoEModularKernel(
+                prepare_finalize=MoEPrepareAndFinalizeNoEP(),
+                fused_experts=TritonOrDeepGemmExperts(
+                    moe_config=FusedMoEConfig(
+                        num_experts=num_experts,
+                        experts_per_token=topk,
+                        hidden_dim=hidden_size,
+                        intermediate_size_per_partition=shard_intermediate_size,
+                        num_local_experts=num_experts,
+                        activation="silu",
+                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+                        in_dtype=init_dtype,
+                        routing_method=RoutingMethodType.TopK,
+                        device="cuda",
+                    ),
+                    quant_config=quant_config,
+                ),
+            )
+
        with override_config(config):
            topk_weights, topk_ids, token_expert_indices = fused_topk(
                x, input_gating, topk, renormalize=not use_deep_gemm
            )
+
+            if use_deep_gemm:
+                return deep_gemm_experts(
+                    x, w1, w2, topk_weights, topk_ids, inplace=True
+                )
            return fused_experts(
                x,
                w1,
@@ -206,7 +239,6 @@ def benchmark_config(
                topk_ids,
                inplace=True,
                quant_config=quant_config,
-                allow_deep_gemm=use_deep_gemm,
            )

    # JIT compilation & warmup
@@ -643,6 +675,7 @@ def main(args: argparse.Namespace):
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
        "Glm4MoeForCausalLM",
+        "Glm4MoeLiteForCausalLM",
        "NemotronHForCausalLM",
    ):
        E = config.n_routed_experts
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -8,10 +8,8 @@ import ray
 import torch
 from transformers import AutoConfig

-from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    _moe_permute,
-    _moe_unpermute_and_reduce,
    moe_permute,
    moe_unpermute,
 )
@@ -41,7 +39,6 @@ def benchmark_permute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
@@ -64,31 +61,14 @@ def benchmark_permute(
        input_gating.copy_(gating_output[i])

    def run():
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
-            )
-        else:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = _moe_permute(
-                qhidden_states, None, topk_ids, num_experts, None, align_block_size
-            )
+        moe_permute(
+            qhidden_states,
+            a1q_scale=None,
+            topk_ids=topk_ids,
+            n_expert=num_experts,
+            expert_map=None,
+            align_block_size=align_block_size,
+        )

    # JIT compilation & warmup
    run()
@@ -133,11 +113,9 @@ def benchmark_unpermute(
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
    num_iters: int = 100,
-    use_customized_permute: bool = False,
 ) -> float:
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
@@ -152,78 +130,37 @@ def benchmark_unpermute(
    )

    def prepare():
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = moe_permute(
-                qhidden_states,
-                a1q_scale=None,
-                topk_ids=topk_ids,
-                n_expert=num_experts,
-                expert_map=None,
-                align_block_size=align_block_size,
-            )
-            # convert to fp16/bf16 as gemm output
-            return (
-                permuted_hidden_states.to(dtype),
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            )
-        else:
-            (
-                permuted_qhidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = _moe_permute(
-                qhidden_states, None, topk_ids, num_experts, None, align_block_size
-            )
-            # convert to fp16/bf16 as gemm output
-            return (
-                permuted_qhidden_states.to(dtype),
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            )
+        (
+            permuted_hidden_states,
+            _,
+            first_token_off,
+            inv_perm_idx,
+            _,
+        ) = moe_permute(
+            qhidden_states,
+            a1q_scale=None,
+            topk_ids=topk_ids,
+            n_expert=num_experts,
+            expert_map=None,
+            align_block_size=align_block_size,
+        )
+        # convert to fp16/bf16 as gemm output
+        return (
+            permuted_hidden_states.to(dtype),
+            first_token_off,
+            inv_perm_idx,
+        )

    def run(input: tuple):
-        if use_customized_permute:
-            (
-                permuted_hidden_states,
-                first_token_off,
-                inv_perm_idx,
-                m_indices,
-            ) = input
-            output = torch.empty_like(hidden_states)
-            moe_unpermute(
-                output,
-                permuted_hidden_states,
-                topk_weights,
-                inv_perm_idx,
-                first_token_off,
-            )
-        else:
-            (
-                permuted_hidden_states,
-                a1q_scale,
-                sorted_token_ids,
-                expert_ids,
-                inv_perm,
-            ) = input
-            _moe_unpermute_and_reduce(
-                output_hidden_states,
-                permuted_hidden_states,
-                inv_perm,
-                topk_weights,
-                True,
-            )
+        (permuted_hidden_states, first_token_off, inv_perm_idx) = input
+        output = torch.empty_like(hidden_states)
+        moe_unpermute(
+            output,
+            permuted_hidden_states,
+            topk_weights,
+            inv_perm_idx,
+            first_token_off,
+        )

    # JIT compilation & warmup
    input = prepare()
@@ -278,8 +215,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
-        use_customized_permute: bool = False,
-    ) -> tuple[dict[str, int], float]:
+    ) -> tuple[float, float]:
        set_random_seed(self.seed)

        permute_time = benchmark_permute(
@@ -291,7 +227,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        unpermute_time = benchmark_unpermute(
            num_tokens,
@@ -302,7 +237,6 @@ class BenchmarkWorker:
            use_fp8_w8a8,
            use_int8_w8a16,
            num_iters=100,
-            use_customized_permute=use_customized_permute,
        )
        return permute_time, unpermute_time

@@ -330,6 +264,7 @@ def main(args: argparse.Namespace):
        config.architectures[0] == "DeepseekV3ForCausalLM"
        or config.architectures[0] == "DeepseekV2ForCausalLM"
        or config.architectures[0] == "Glm4MoeForCausalLM"
+        or config.architectures[0] == "Glm4MoeLiteForCausalLM"
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
@@ -348,7 +283,6 @@ def main(args: argparse.Namespace):
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    use_customized_permute = args.use_customized_permute

    if args.batch_size is None:
        batch_sizes = [
@@ -400,7 +334,6 @@ def main(args: argparse.Namespace):
                dtype,
                use_fp8_w8a8,
                use_int8_w8a16,
-                use_customized_permute,
            )
            for batch_size in batch_sizes
        ],
@@ -420,7 +353,6 @@ if __name__ == "__main__":
    parser.add_argument(
        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
    )
-    parser.add_argument("--use-customized-permute", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--trust-remote-code", action="store_true")
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import (
    calc_diff,
    fp8_gemm_nt,
-    get_col_major_tma_aligned_tensor,
    per_block_cast_to_fp8,
 )

@@ -48,8 +47,9 @@ def benchmark_shape(
    block_size = [128, 128]

    # Pre-quantize A for all implementations
-    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
-    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
+    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
+        A, block_size[1], column_major_scales=True, tma_aligned_scales=True
+    )
    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -13,6 +13,8 @@ endif()
 #
 # Define environment variables for special configurations
 #
+set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
+set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
 set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
 set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
 set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
@@ -103,6 +105,16 @@ else()
    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
    find_isa(${CPUINFO} "S390" S390_FOUND)
    find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
+
+    # Support cross-compilation by allowing override via environment variables
+    if (ENABLE_AVX2)
+        set(AVX2_FOUND ON)
+        message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
+    endif()
+    if (ENABLE_AVX512)
+        set(AVX512_FOUND ON)
+        message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
+    endif()
 endif()

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -379,6 +391,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
    endif()
 endif()

+if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/shm.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+
 if(USE_ONEDNN)
    set(VLLM_EXT_SRC
        "csrc/cpu/dnnl_kernels.cpp"
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
-        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
+        GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@@ -30,6 +30,24 @@ endif()
 FetchContent_MakeAvailable(flashmla)
 message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")

+# Vendor FlashMLA interface into vLLM with torch-ops shim.
+set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
+file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
+file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
+     FLASHMLA_INTERFACE_CONTENT)
+string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
+               "import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
+               FLASHMLA_INTERFACE_CONTENT
+               "${FLASHMLA_INTERFACE_CONTENT}")
+file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
+     "${FLASHMLA_INTERFACE_CONTENT}")
+
+# Install the generated flash_mla_interface.py to the wheel
+# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
+install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
+        DESTINATION vllm/third_party/flashmla/
+        COMPONENT _flashmla_C)
+
 # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
 # Only build FlashMLA kernels if we are building for something compatible with 
 # sm90a
@@ -55,16 +73,42 @@ if(FLASH_MLA_ARCHS)

    set(FlashMLA_SOURCES
        ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
-        ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
-        ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
-        ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
+
+        # Misc kernels for decoding
+        ${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
+        ${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
+
+        # sm90 dense decode
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
+
+        # sm90 sparse decode
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
+
+        # sm90 sparse prefill
        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
+
+        # sm100 dense prefill
        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
-        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
+
+        # sm100 sparse prefill
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
+
+        # sm100 sparse decode
+        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
+        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
    )

    set(FlashMLA_Extension_SOURCES
@@ -76,6 +120,7 @@ if(FLASH_MLA_ARCHS)

    set(FlashMLA_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc
+        ${flashmla_SOURCE_DIR}/csrc/kerutils/include
        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -83,7 +128,6 @@ if(FLASH_MLA_ARCHS)

    set(FlashMLA_Extension_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc
-        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
@@ -110,9 +154,12 @@ if(FLASH_MLA_ARCHS)

    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+    # Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
    target_compile_options(_flashmla_C PRIVATE
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
-        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
+        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
+        $<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
+        $<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)

    define_extension_target(
        _flashmla_extension_C
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -7,6 +7,7 @@
 #include <vector>

 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 int64_t block_size_in_bytes,
                 const torch::Tensor& block_mapping);

 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -24,7 +24,14 @@
 typedef __hip_bfloat16 __nv_bfloat16;
 #endif

+#if defined(__gfx942__)
+constexpr float kFp8ScaleDivisor = 224.f;
+#else
+constexpr float kFp8ScaleDivisor = 448.f;
+#endif
+
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 int64_t block_size_in_bytes,
                 const torch::Tensor& block_mapping) {
  torch::Device src_device = src.device();
  torch::Device dst_device = dst.device();
@@ -49,10 +56,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
  char* src_ptr = static_cast<char*>(src.data_ptr());
  char* dst_ptr = static_cast<char*>(dst.data_ptr());

-  // We use the stride instead of numel in case the cache is padded for memory
-  // alignment reasons, we assume the blocks data (inclusive of any padding)
-  // is contiguous in memory
-  const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
  const at::cuda::OptionalCUDAGuard device_guard(
      src_device.is_cuda() ? src_device : dst_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -205,7 +208,8 @@ __global__ void reshape_and_cache_flash_kernel(
    const int64_t block_stride, const int64_t page_stride,
    const int64_t head_stride, const int64_t key_stride,
    const int64_t value_stride, const int num_heads, const int head_size,
-    const int block_size, const float* k_scale, const float* v_scale) {
+    const int block_size, const float* k_scale, const float* v_scale,
+    const int kv_scale_stride) {
  const int64_t token_idx = blockIdx.x;
  const int64_t slot_idx = slot_mapping[token_idx];
  // NOTE: slot_idx can be -1 if the token is padded
@@ -229,21 +233,23 @@ __global__ void reshape_and_cache_flash_kernel(
  // this is true for the NHD layout where `head_stride == head_size`
  const bool is_contiguous_heads = (head_stride == head_size);

-  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
-  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
-  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
-  if (is_contiguous_heads) {
-    // NHD layout
+
+  if (is_contiguous_heads && kv_scale_stride == 0) {
+    // NHD layout and k/v_scales are [1] (i.e. single scale for all heads)
    // kv cache: [num_blocks, block_size, num_heads, head_size]
+    float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
+    float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
+
+    CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+    CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+
    vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
                                       blockDim.x, k_op);
-
    vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
                                       threadIdx.x, blockDim.x, v_op);
-
  } else {
+    // HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
    // HND layout: heads are strided, but each head_size segment is contiguous
    // kv cache: [num_blocks, num_heads, block_size, head_size]
    const int lane = threadIdx.x & 31;     // 0..31 within warp
@@ -259,6 +265,16 @@ __global__ void reshape_and_cache_flash_kernel(
      cache_t* __restrict__ v_dst_h =
          value_dst + static_cast<int64_t>(head) * head_stride;

+      float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
+                              ? 0.f
+                              : k_scale[head * kv_scale_stride];
+      float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
+                              ? 0.f
+                              : v_scale[head * kv_scale_stride];
+
+      CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+      CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+
      // within each head, let the 32 threads of the warp perform the vector
      // copy
      vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
@@ -391,8 +407,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
  }

  // Compute the scale for the tile
-  float tile_scale = max_abs / 448.f;
-  tile_scale = fmaxf(tile_scale, FLT_MIN);
+  float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);

  // The first lane of each half-warp writes the scale to kv_cache
  if ((lane_idx == 0) || (lane_idx == 16)) {
@@ -461,11 +476,8 @@ __global__ void indexer_k_quant_and_cache_kernel(
 #endif
  }

-#if defined(__gfx942__)
-  float scale = fmaxf(amax, 1e-4) / 224.0f;
-#else
-  float scale = fmaxf(amax, 1e-4) / 448.0f;
-#endif
+  float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
+
  if (use_ue8m0) {
    scale = exp2f(ceilf(log2f(scale)));
  }
@@ -608,7 +620,8 @@ void reshape_and_cache(
          slot_mapping.data_ptr<int64_t>(), block_stride, page_stride,    \
          head_stride, key_stride, value_stride, num_heads, head_size,    \
          block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
-          reinterpret_cast<const float*>(v_scale.data_ptr()));
+          reinterpret_cast<const float*>(v_scale.data_ptr()),             \
+          kv_scale_stride);

 void reshape_and_cache_flash(
    torch::Tensor& key,        // [num_tokens, num_heads, head_size]
@@ -617,8 +630,9 @@ void reshape_and_cache_flash(
    torch::Tensor&
        value_cache,  // [num_blocks, block_size, num_heads, head_size]
    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
-    torch::Tensor& v_scale) {
+    const std::string& kv_cache_dtype,
+    torch::Tensor& k_scale,    // [1] or [num_heads]
+    torch::Tensor& v_scale) {  // [1] or [num_heads]
  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
  // slot_mapping.size(0) because of padding for CUDA graphs.
  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
@@ -641,6 +655,12 @@ void reshape_and_cache_flash(
  int64_t head_stride = key_cache.stride(2);
  TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));

+  TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
+              "k_scale and v_scale must have the same shape");
+  TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
+              "k_scale and v_scale must be of shape [1] or [num_heads]");
+  int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
+
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -80,8 +80,10 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
    reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
  }

-  explicit FP16Vec16(const FP32Vec16& vec);
+  // ASIMD does not support non-temporal loads
+  explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}

+  explicit FP16Vec16(const FP32Vec16& vec);
  void save(void* ptr) const {
    vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
    vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
@@ -190,6 +192,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const void* ptr)
      : reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};

+  // ASIMD does not support non-temporal loads
+  explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
+
  explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};

  explicit BF16Vec16(const FP32Vec16&);
@@ -474,6 +479,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
      : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
             vld1q_f32(ptr + 12)}) {}

+  // ASIMD does not support non-temporal loads
+  explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
+
  explicit FP32Vec16(float32x4x4_t data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec8& data) {
@@ -756,6 +764,96 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
  };
 };

+struct INT8Vec64 : public Vec<INT8Vec64> {  // 64 x int8 held in four 128-bit NEON registers
+  constexpr static int VEC_ELEM_NUM = 64;
+  union AliasReg {  // views the register group as 64 scalar int8_t values
+    int8x16x4_t reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+  int8x16x4_t reg;
+
+  explicit INT8Vec64(const int8_t* ptr) { reg = vld1q_s8_x4(ptr); }  // load 64 contiguous int8 values
+
+  // ASIMD does not support non-temporal loads: the bool tag is ignored and a regular load is done
+  explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
+
+  void save(int8_t* ptr) const { vst1q_s8_x4(ptr, reg); }  // store all 64 lanes
+
+  // masked store: writes only lanes [0, elem_num) to p; elem_num must be in [1, 64]
+  void save(int8_t* p, int elem_num) const {
+    TORCH_CHECK(elem_num <= VEC_ELEM_NUM && elem_num > 0);
+
+    if (elem_num == VEC_ELEM_NUM) {  // fast path: nothing to mask
+      vst1q_s8_x4(p, reg);
+      return;
+    }
+
+    const int full_quadwords = elem_num / 16;  // registers stored in full
+    const int remaining_bytes = elem_num % 16;  // lanes taken from the next register
+
+    for (int i = 0; i < full_quadwords; ++i) {  // whole 16-byte registers first
+      vst1q_s8(p + 16 * i, reg.val[i]);
+    }
+
+    if (remaining_bytes) {
+      const int8x16_t v = reg.val[full_quadwords];  // register holding the partial tail
+      int8_t* tail = p + 16 * full_quadwords;
+      switch (remaining_bytes) {  // enters at lane remaining_bytes-1, falls through to lane 0
+        case 15:
+          tail[14] = vgetq_lane_s8(v, 14);
+          [[fallthrough]];
+        case 14:
+          tail[13] = vgetq_lane_s8(v, 13);
+          [[fallthrough]];
+        case 13:
+          tail[12] = vgetq_lane_s8(v, 12);
+          [[fallthrough]];
+        case 12:
+          tail[11] = vgetq_lane_s8(v, 11);
+          [[fallthrough]];
+        case 11:
+          tail[10] = vgetq_lane_s8(v, 10);
+          [[fallthrough]];
+        case 10:
+          tail[9] = vgetq_lane_s8(v, 9);
+          [[fallthrough]];
+        case 9:
+          tail[8] = vgetq_lane_s8(v, 8);
+          [[fallthrough]];
+        case 8:
+          tail[7] = vgetq_lane_s8(v, 7);
+          [[fallthrough]];
+        case 7:
+          tail[6] = vgetq_lane_s8(v, 6);
+          [[fallthrough]];
+        case 6:
+          tail[5] = vgetq_lane_s8(v, 5);
+          [[fallthrough]];
+        case 5:
+          tail[4] = vgetq_lane_s8(v, 4);
+          [[fallthrough]];
+        case 4:
+          tail[3] = vgetq_lane_s8(v, 3);
+          [[fallthrough]];
+        case 3:
+          tail[2] = vgetq_lane_s8(v, 2);
+          [[fallthrough]];
+        case 2:
+          tail[1] = vgetq_lane_s8(v, 1);
+          [[fallthrough]];
+        case 1:
+          tail[0] = vgetq_lane_s8(v, 0);
+          break;
+        default:  // not reached: remaining_bytes is 1..15 inside this branch
+          break;
+      }
+    }
+  }
+
+  // ASIMD does not support non-temporal stores: falls back to the regular 64-lane save
+  void nt_save(int8_t* ptr) const { save(ptr); }
+};  // INT8Vec64
+
 template <typename T>
 struct VecType {
  using vec_type = void;
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -360,13 +360,14 @@ void onednn_scaled_mm(
    const std::optional<torch::Tensor>& azp,      // [M] or [1]
    const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
    const std::optional<torch::Tensor>& bias,     // [N]
-    int64_t handler) {
+    const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.is_contiguous());
  TORCH_CHECK(c.is_contiguous());
  W8A8MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
+          handler_tensor.item<int64_t>());
  const int32_t* azp_ptr = nullptr;
  if (azp.has_value()) {
    azp_ptr = azp->data_ptr<int32_t>();
@@ -519,13 +520,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,

 void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
               const torch::Tensor& a,  // [M, IC], row-major
-               const std::optional<torch::Tensor>& bias, int64_t handler) {
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor) {
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
  TORCH_CHECK(c.stride(-1) == 1);
  MatMulPrimitiveHandler* ptr =
-      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());

 // ACL matmuls expect contiguous source tensors
 #ifdef VLLM_USE_ACL
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -5,6 +5,10 @@
 #include <sys/stat.h>
 #include <unistd.h>

+#ifdef __aarch64__
+  #include <atomic>
+#endif
+
 namespace {
 #define MAX_SHM_RANK_NUM 8
 #define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
@@ -34,8 +38,17 @@ struct KernelVecType<c10::Half> {
 };

 struct ThreadSHMContext {
+#ifdef __aarch64__
+  // memory model is weaker on AArch64, so we use atomic variables for
+  // consumer (load-acquire) and producer (store-release) to make sure
+  // that a stamp cannot be ready before the corresponding data is ready.
+  std::atomic<char> _curr_thread_stamp[2];
+  std::atomic<char> _ready_thread_stamp[2];
+  static_assert(std::atomic<char>::is_always_lock_free);
+#else
  volatile char _curr_thread_stamp[2];
  volatile char _ready_thread_stamp[2];
+#endif  // __aarch64__
  int local_stamp_buffer_idx;
  int remote_stamp_buffer_idx;
  int thread_id;
@@ -62,10 +75,17 @@ struct ThreadSHMContext {
    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
    TORCH_CHECK((size_t)this % 64 == 0);
    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
+#ifdef __aarch64__
+    _curr_thread_stamp[0].store(1, std::memory_order_relaxed);
+    _curr_thread_stamp[1].store(1, std::memory_order_relaxed);
+    _ready_thread_stamp[0].store(0, std::memory_order_relaxed);
+    _ready_thread_stamp[1].store(0, std::memory_order_relaxed);
+#else
    _curr_thread_stamp[0] = 1;
    _curr_thread_stamp[1] = 1;
    _ready_thread_stamp[0] = 0;
    _ready_thread_stamp[1] = 0;
+#endif  // __aarch64__
    _thread_buffer_mask[0] = 0;
    _thread_buffer_mask[1] = 0;
    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
@@ -103,19 +123,43 @@ struct ThreadSHMContext {
    _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
  }

-  char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
+  char get_curr_stamp(int idx) const {
+#ifdef __aarch64__
+    return _curr_thread_stamp[idx].load(std::memory_order_acquire);
+#else
+    return _curr_thread_stamp[idx];
+#endif  // __aarch64__
+  }

-  char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
+  char get_ready_stamp(int idx) const {
+#ifdef __aarch64__
+    return _ready_thread_stamp[idx].load(std::memory_order_acquire);
+#else
+    return _ready_thread_stamp[idx];
+#endif  // __aarch64__
+  }

  void next_stamp() {
+#ifdef __aarch64__
+    _curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
+        1, std::memory_order_release);
+#else
    _mm_mfence();
    _curr_thread_stamp[local_stamp_buffer_idx] += 1;
+#endif  // __aarch64__
  }

  void commit_ready_stamp() {
+#ifdef __aarch64__
+    _ready_thread_stamp[local_stamp_buffer_idx].store(
+        _curr_thread_stamp[local_stamp_buffer_idx].load(
+            std::memory_order_relaxed),
+        std::memory_order_release);
+#else
    _mm_mfence();
    _ready_thread_stamp[local_stamp_buffer_idx] =
        _curr_thread_stamp[local_stamp_buffer_idx];
+#endif  // __aarch64__
  }

  int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
@@ -142,7 +186,11 @@ struct ThreadSHMContext {
        break;
      }
      ++_spinning_count;
+#ifdef __aarch64__
+      __asm__ __volatile__("yield");
+#else
      _mm_pause();
+#endif  // __aarch64__
    }
  }

--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -19,13 +19,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                      const std::optional<torch::Tensor>& azp,
                      const std::optional<torch::Tensor>& azp_adj,
                      const std::optional<torch::Tensor>& bias,
-                      int64_t handler);
+                      const torch::Tensor& handler_tensor);

 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                 int64_t primitive_cache_size);

 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
-               const std::optional<torch::Tensor>& bias, int64_t handler);
+               const std::optional<torch::Tensor>& bias,
+               const torch::Tensor& handler_tensor);

 bool is_onednn_acl_supported();

@@ -196,7 +197,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN GEMM
  ops.def(
      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
-      "int handler) -> ()");
+      "Tensor handler_tensor) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);

  // Check if oneDNN was built with ACL backend
@@ -212,7 +213,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
  ops.def(
      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
-      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
+      "Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);

  // Compute int8 quantized tensor for given scaling factor.
@@ -230,7 +231,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif

 // SHM CCL
-#ifdef __AVX512F__
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
  ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
          &init_shm_manager);
  ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
@@ -250,7 +251,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
  ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
          &shm_recv_tensor_list);
-#endif
+#endif  // defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))

  // sgl-kernels
 #if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -31,8 +31,6 @@ namespace moe {

 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
-constexpr int32_t BLOCK_SIZE = 512;
-constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

 namespace warp_topk {

@@ -65,14 +63,6 @@ __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
  return res;
 }

-template <typename T, typename idxT>
-int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
-  int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
-  int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
-  return max(cache_topk,
-             round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
-}
-
 template <int size, bool ascending, bool reverse, typename T, typename idxT,
          bool is_stable>
 struct BitonicMerge {
@@ -267,6 +257,15 @@ class WarpSort {
    }
  }

+  // Read-only accessors for slot `i` of this lane's idx_arr_ / val_arr_; `i` is
+  // not bounds-checked. NOTE: For the common case `capacity == WARP_SIZE`,
+  // `max_arr_len_ == 1` and callers should use `i == 0`.
+  __device__ __forceinline__ idxT get_idx(int i = 0) const {
+    return idx_arr_[i];
+  }
+
+  __device__ __forceinline__ T get_val(int i = 0) const { return val_arr_[i]; }
+
 protected:
  static constexpr int max_arr_len_ = capacity / WARP_SIZE;

@@ -285,6 +284,7 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
  __device__ WarpSelect(idxT k, T dummy)
      : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
        k_th_(dummy),
+        k_th_idx_(0),
        k_th_lane_((k - 1) % WARP_SIZE) {
    extern __shared__ char smem_buf[];  // extern __shared__ T smem_buf[];

@@ -346,9 +346,6 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
      idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
      merge_buf_(val, idx);
    }
-
-    // after done(), smem is used for merging results among warps
-    __syncthreads();
  }

 private:
@@ -503,255 +500,186 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
  }
 }

-template <typename T, typename BiasT, ScoringFunc SF>
-__global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
-                                    int64_t const num_tokens,
-                                    int64_t const num_cases,
-                                    int64_t const n_group,
-                                    int64_t const num_experts_per_group) {
-  int32_t warp_id = threadIdx.x / WARP_SIZE;
-  int32_t lane_id = threadIdx.x % WARP_SIZE;
-
-  int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
-  if (case_id < num_cases) {
-    input += case_id * num_experts_per_group;
-    // bias is per expert group, offset to current group
-    int32_t group_id = case_id % n_group;
-    BiasT const* group_bias = bias + group_id * num_experts_per_group;
-    output += case_id;
-
-    cg::thread_block block = cg::this_thread_block();
-    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
-
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
-#endif
-    topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
-                               num_experts_per_group);
-  }
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
-          int NGroup = -1>
-__global__ void group_idx_and_topk_idx_kernel(
-    T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
-    BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
-    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
-    int64_t const num_experts_per_group, bool renormalize,
+template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
+__global__ void grouped_topk_fused_kernel(
+    T* scores, float* topk_values, IdxT* topk_indices, BiasT const* bias,
+    int64_t const num_tokens, int64_t const num_experts, int64_t const n_group,
+    int64_t const topk_group, int64_t const topk, bool renormalize,
    double routed_scaling_factor) {
-  int32_t warp_id = threadIdx.x / WARP_SIZE;
-  int32_t lane_id = threadIdx.x % WARP_SIZE;
-  int32_t case_id =
-      blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
-  scores += case_id * num_experts;
-  group_scores += case_id * n_group;
-  topk_values += case_id * topk;
-  topk_indices += case_id * topk;
+  int32_t const token_id = static_cast<int32_t>(blockIdx.x);
+  if (token_id >= num_tokens) {
+    return;
+  }

-  constexpr bool kUseStaticNGroup = (NGroup > 0);
-  // use int32 to avoid implicit conversion
-  int32_t const n_group_i32 =
-      kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
+  int32_t const warp_id = threadIdx.x / WARP_SIZE;
+  int32_t const lane_id = threadIdx.x % WARP_SIZE;

-  int32_t align_num_experts_per_group =
-      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
+  int32_t const n_group_i32 = static_cast<int32_t>(n_group);
+  int32_t const topk_group_i32 = static_cast<int32_t>(topk_group);
+  int32_t const topk_i32 = static_cast<int32_t>(topk);
+  int32_t const num_experts_i32 = static_cast<int32_t>(num_experts);
+
+  int32_t const num_warps = blockDim.x / WARP_SIZE;
+  if (warp_id >= n_group_i32 || num_warps < n_group_i32) {
+    return;
+  }
+
+  int32_t const num_experts_per_group = num_experts_i32 / n_group_i32;
+
+  T* scores_token = scores + static_cast<int64_t>(token_id) * num_experts;

  cg::thread_block block = cg::this_thread_block();
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);

-  extern __shared__ char smem_buf[];  // NOTE: reuse the shared memory here to
-                                      // store the target topk idx
-  int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
-  T* s_topk_value =
-      reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
-      warp_id * topk;
-  s_topk_idx += warp_id * topk;
+  extern __shared__ char smem_buf[];
+  // warpSelect internal staging buffer layout
+  size_t const val_bytes =
+      static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
+  size_t const val_bytes_aligned =
+      warp_topk::round_up_to_multiple_of<256>(val_bytes);
+  size_t const idx_bytes =
+      static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
+  size_t const internal_bytes = val_bytes_aligned + idx_bytes;

-  T value = neg_inf<T>();
-  T topk_group_value = neg_inf<T>();
-  int32_t num_equalto_topkth_group;
+  // user-managed shared memory starts after warpSelect internal staging.
+  uintptr_t ptr_u = reinterpret_cast<uintptr_t>(smem_buf + internal_bytes);
+  ptr_u = (ptr_u + 15) & ~static_cast<uintptr_t>(15);  // align to 16B
+  T* s_group_scores = reinterpret_cast<T*>(ptr_u);

 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.wait;");  // I think all prolog can be put before
                                         // acqbulk because it's ptr arithmetic
 #endif

-  if (case_id < num_tokens) {
-    // calculate group_idx
-    int32_t target_num_min =
-        WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
-    // The check is necessary to avoid abnormal input
-    if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
-      value = group_scores[lane_id];
-    }
+  // phase 1: per-group scan
+  int32_t const group_offset = warp_id * num_experts_per_group;
+  topk_with_k2<T, BiasT, SF>(s_group_scores + warp_id,
+                             scores_token + group_offset, bias + group_offset,
+                             tile, lane_id, num_experts_per_group);

-    int count_equal_to_top_value = WARP_SIZE - n_group_i32;
-    int pre_count_equal_to_top_value = 0;
-    // Use loop to find the largset top_group
-    while (count_equal_to_top_value < target_num_min) {
-      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
-      if (value == topk_group_value) {
-        value = neg_inf<T>();
-      }
-      pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value =
-          __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
-    }
-    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
-  }
  __syncthreads();

+  // phase 2: warp0 selects groups + merges candidates to final topk
+  if (warp_id != 0) {
+    return;
+  }
+
+  topk_values += static_cast<int64_t>(token_id) * topk;
+  topk_indices += static_cast<int64_t>(token_id) * topk;
+
+  // select topk_group groups by group score
  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                        /* is_stable */ true>
-      queue((int32_t)topk, neg_inf<T>());
+      group_sel(static_cast<int32_t>(topk_group_i32), neg_inf<T>());

-  int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
-  if (case_id < num_tokens && if_proceed_next_topk) {
-    auto process_group = [&](int i_group) {
-      if ((group_scores[i_group] > topk_group_value) ||
-          ((group_scores[i_group] == topk_group_value) &&
-           (count_equalto_topkth_group < num_equalto_topkth_group))) {
-        int32_t offset = i_group * num_experts_per_group;
-        for (int32_t i = lane_id; i < align_num_experts_per_group;
-             i += WARP_SIZE) {
-          T candidates = neg_inf<T>();
-          if (i < num_experts_per_group) {
-            // apply scoring function (if any) and add bias
-            T input = scores[offset + i];
-            if (is_finite(input)) {
-              T score = apply_scoring<SF>(input);
-              candidates = score + static_cast<T>(bias[offset + i]);
-            }
-          }
-          queue.add(candidates, offset + i);
-        }
-        if (group_scores[i_group] == topk_group_value) {
-          count_equalto_topkth_group++;
+  // all lanes must participate in WarpSelect::add().
+  T gscore = (lane_id < n_group_i32) ? s_group_scores[lane_id] : neg_inf<T>();
+  group_sel.add(gscore, lane_id);
+  group_sel.done();
+
+  // proceed only if the topk_group-th selected group score is not -inf
+  bool proceed = false;
+  if (topk_group_i32 > 0) {
+    int const kth_lane = topk_group_i32 - 1;  // lane read for the k-th pick
+    // broadcast the k-th selected group score to all lanes
+    T kth_val = __shfl_sync(FULL_WARP_MASK, group_sel.get_val(0), kth_lane);
+    proceed = (kth_val != neg_inf<T>());
+  }
+
+  if (!proceed) {  // fallback: experts [0, topk) with uniform weight 1/topk
+    for (int i = lane_id; i < topk_i32; i += WARP_SIZE) {
+      topk_indices[i] = static_cast<IdxT>(i);
+      topk_values[i] = 1.0f / static_cast<float>(topk_i32);
+    }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
+    return;
+  }
+
+  // merge per-group topk candidates for selected groups, then select topk
+  warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
+                        /* is_stable */ true>
+      expert_sel(static_cast<int32_t>(topk_i32), neg_inf<T>());
+
+  // selected group ids reside in lanes [0, topk_group)
+  int32_t sel_gid_lane = (lane_id < topk_group_i32) ? group_sel.get_idx(0) : 0;
+
+  // add candidates from selected groups to expert_sel
+  for (int32_t g = 0; g < topk_group_i32; ++g) {
+    int32_t gid = __shfl_sync(FULL_WARP_MASK, sel_gid_lane, g);
+    int32_t const offset = gid * num_experts_per_group;
+    int32_t const align_num_experts_per_group =
+        warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
+    for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) {
+      // all lanes must call `add()` the same number of times.
+      T cand = neg_inf<T>();
+      int32_t idx = 0;
+      if (i < num_experts_per_group) {
+        idx = offset + i;
+        T input = scores_token[idx];
+        if (is_finite(input)) {
+          T score = apply_scoring<SF>(input);
+          cand = score + static_cast<T>(bias[idx]);
        }
      }
-    };
-
-    if constexpr (kUseStaticNGroup) {
-#pragma unroll
-      for (int i_group = 0; i_group < NGroup; ++i_group) {
-        process_group(i_group);
-      }
-    } else {
-      for (int i_group = 0; i_group < n_group_i32; ++i_group) {
-        process_group(i_group);
-      }
-    }
-    queue.done();
-    // Get the topk_idx
-    queue.dumpIdx(s_topk_idx);
-  }
-
-  // Load the valid score value
-  // Calculate the summation
-  float topk_sum = 1e-20;
-  if (case_id < num_tokens && if_proceed_next_topk) {
-    for (int i = lane_id;
-         i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
-         i += WARP_SIZE) {
-      T value = cuda_cast<T, float>(0.0f);
-      if (i < topk) {
-        // Load the score value (without bias) for normalization
-        T input = scores[s_topk_idx[i]];
-        value = apply_scoring<SF>(input);
-        s_topk_value[i] = value;
-      }
-      if (renormalize) {
-        topk_sum +=
-            cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
-      }
+      expert_sel.add(cand, idx);
    }
  }
+  expert_sel.done();

-  __syncthreads();
-
-  if (case_id < num_tokens) {
-    if (if_proceed_next_topk) {
-      float scale = routed_scaling_factor;
-      if (renormalize) {
-        scale /= topk_sum;
-      }
-      for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        float base = cuda_cast<float, T>(s_topk_value[i]);
-        float value = base * scale;
-        topk_indices[i] = s_topk_idx[i];
-        topk_values[i] = value;
-      }
-    } else {
-      for (int i = lane_id; i < topk; i += WARP_SIZE) {
-        topk_indices[i] = i;
-        topk_values[i] = 1.0f / topk;
-      }
-    }
-    // Note: when if_proceed_next_topk==false, choose the first 8 experts as the
-    // default result.
+  // Compute unbiased routing weights (score without bias) + optional renorm.
+  float lane_unbiased = 0.0f;
+  IdxT lane_idx = 0;
+  if (lane_id < topk_i32) {  // lanes [0, topk) each hold one selected expert
+    lane_idx = static_cast<IdxT>(expert_sel.get_idx(0));
+    T in = scores_token[static_cast<int32_t>(lane_idx)];
+    lane_unbiased = cuda_cast<float, T>(apply_scoring<SF>(in));
+  }
+
+  float topk_sum = 1e-20f;  // nonzero seed avoids dividing by zero below
+  if (renormalize) {
+    topk_sum += cg::reduce(tile, lane_unbiased, cg::plus<float>());
+  }
+
+  float scale = static_cast<float>(routed_scaling_factor);
+  if (renormalize) {
+    scale /= topk_sum;
+  }
+
+  if (lane_id < topk_i32) {
+    topk_indices[lane_id] = lane_idx;
+    topk_values[lane_id] = lane_unbiased * scale;
+  }
+
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }

-template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
-inline void launch_group_idx_and_topk_kernel(
-    cudaLaunchConfig_t const& config, T* scores, T* group_scores,
-    float* topk_values, IdxT* topk_indices, BiasT const* bias,
-    int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
-    int64_t const topk, int64_t const num_experts,
-    int64_t const num_experts_per_group, bool const renormalize,
-    double const routed_scaling_factor) {
-  auto launch = [&](auto* kernel_instance2) {
-    cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
-                       topk_values, topk_indices, bias, num_tokens, n_group,
-                       topk_group, topk, num_experts, num_experts_per_group,
-                       renormalize, routed_scaling_factor);
-  };
-
-  switch (n_group) {
-    case 4: {
-      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
-      break;
-    }
-    case 8: {
-      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
-      break;
-    }
-    case 16: {
-      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
-      break;
-    }
-    case 32: {
-      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
-      break;
-    }
-    default: {
-      launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
-      break;
-    }
-  }
-}
-
 template <typename T, typename BiasT, typename IdxT>
-void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
-                   IdxT* topk_indices, BiasT const* bias,
-                   int64_t const num_tokens, int64_t const num_experts,
-                   int64_t const n_group, int64_t const topk_group,
-                   int64_t const topk, bool const renormalize,
-                   double const routed_scaling_factor, int const scoring_func,
-                   bool enable_pdl = false, cudaStream_t const stream = 0) {
-  int64_t num_cases = num_tokens * n_group;
-  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
+void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
+                   BiasT const* bias, int64_t const num_tokens,
+                   int64_t const num_experts, int64_t const n_group,
+                   int64_t const topk_group, int64_t const topk,
+                   bool const renormalize, double const routed_scaling_factor,
+                   int const scoring_func, bool enable_pdl = false,
+                   cudaStream_t const stream = 0) {
  cudaLaunchConfig_t config;
-  config.gridDim = topk_with_k2_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = 0;
+  // One block per token, one warp per group. A 0-block grid is an invalid
+  // launch, so empty input runs 1 block that exits at the token_id guard.
+  config.gridDim = num_tokens > 0 ? static_cast<uint32_t>(num_tokens) : 1u;
+  config.blockDim = static_cast<uint32_t>(n_group) * WARP_SIZE;
+  int32_t const num_warps = static_cast<int32_t>(n_group);  // smem sizing
+  size_t const val_bytes =
+      static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
+  size_t const val_bytes_aligned =
+      warp_topk::round_up_to_multiple_of<256>(val_bytes);
+  size_t const idx_bytes =
+      static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
+  size_t const internal_bytes = val_bytes_aligned + idx_bytes;  // staging
+  size_t const extra_bytes = 16 + static_cast<size_t>(n_group) * sizeof(T);
+  config.dynamicSmemBytes = internal_bytes + extra_bytes;  // + group scores
  config.stream = stream;
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
@@ -759,66 +687,35 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
  config.numAttrs = 1;
  config.attrs = attrs;
  auto const sf = static_cast<ScoringFunc>(scoring_func);
-  int64_t const num_experts_per_group = num_experts / n_group;
-  auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
-    cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
-                       num_tokens, num_cases, n_group, num_experts_per_group);
-  };
  switch (sf) {
    case SCORING_NONE: {
-      auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
-      launch_topk_with_k2(kernel_instance1);
-      break;
+      auto* kernel_instance =
+          &grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_NONE>;
+      cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
+                         topk_indices, bias, num_tokens, num_experts, n_group,
+                         topk_group, topk, renormalize, routed_scaling_factor);
+      return;
    }
    case SCORING_SIGMOID: {
-      auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
-      launch_topk_with_k2(kernel_instance1);
-      break;
+      auto* kernel_instance =
+          &grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_SIGMOID>;
+      cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
+                         topk_indices, bias, num_tokens, num_experts, n_group,
+                         topk_group, topk, renormalize, routed_scaling_factor);
+      return;
    }
    default:
      // should be guarded by higher level checks.
      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
  }
-
-  int64_t topk_with_k_group_num_blocks =
-      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
-  size_t dynamic_smem_in_bytes =
-      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
-                                                           topk);
-  config.gridDim = topk_with_k_group_num_blocks;
-  config.blockDim = BLOCK_SIZE;
-  config.dynamicSmemBytes = dynamic_smem_in_bytes;
-  config.stream = stream;
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  switch (sf) {
-    case SCORING_NONE: {
-      launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
-          config, scores, group_scores, topk_values, topk_indices, bias,
-          num_tokens, n_group, topk_group, topk, num_experts,
-          num_experts_per_group, renormalize, routed_scaling_factor);
-      break;
-    }
-    case SCORING_SIGMOID: {
-      launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
-          config, scores, group_scores, topk_values, topk_indices, bias,
-          num_tokens, n_group, topk_group, topk, num_experts,
-          num_experts_per_group, renormalize, routed_scaling_factor);
-      break;
-    }
-    default:
-      TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
-  }
 }

-#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT)                                  \
-  template void invokeNoAuxTc<T, BiasT, IdxT>(                                \
-      T * scores, T * group_scores, float* topk_values, IdxT* topk_indices,   \
-      BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
-      int64_t const n_group, int64_t const topk_group, int64_t const topk,    \
-      bool const renormalize, double const routed_scaling_factor,             \
+#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT)                                 \
+  template void invokeNoAuxTc<T, BiasT, IdxT>(                               \
+      T * scores, float* topk_values, IdxT* topk_indices, BiasT const* bias, \
+      int64_t const num_tokens, int64_t const num_experts,                   \
+      int64_t const n_group, int64_t const topk_group, int64_t const topk,   \
+      bool const renormalize, double const routed_scaling_factor,            \
      int const scoring_func, bool enable_pdl, cudaStream_t const stream);

 INSTANTIATE_NOAUX_TC(float, float, int32_t);
@@ -843,17 +740,21 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
  int64_t num_tokens = input_size[0];
  int64_t num_experts = input_size[1];
  TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
+  TORCH_CHECK(n_group > 0, "n_group must be positive");
+  TORCH_CHECK(topk > 0, "topk must be positive");
+  TORCH_CHECK(topk_group > 0, "topk_group must be positive");
+  TORCH_CHECK(topk_group <= n_group, "topk_group must be <= n_group");
  TORCH_CHECK(num_experts % n_group == 0,
              "num_experts should be divisible by n_group");
  TORCH_CHECK(n_group <= 32,
              "n_group should be smaller than or equal to 32 for now");
  TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
+  TORCH_CHECK(topk <= topk_group * (num_experts / n_group),
+              "topk must be <= topk_group * (num_experts / n_group)");
  TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
                  scoring_func == vllm::moe::SCORING_SIGMOID,
              "scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");

-  torch::Tensor group_scores = torch::empty(
-      {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
  // Always output float32 for topk_values (eliminates Python-side conversion)
  torch::Tensor topk_values = torch::empty(
      {num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
@@ -868,7 +769,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
      case torch::kFloat16:                                                  \
        vllm::moe::invokeNoAuxTc<T, half, IdxT>(                             \
            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
-            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
            reinterpret_cast<half const*>(bias.data_ptr()), num_tokens,      \
@@ -879,7 +779,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
      case torch::kFloat32:                                                  \
        vllm::moe::invokeNoAuxTc<T, float, IdxT>(                            \
            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
-            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
            reinterpret_cast<float const*>(bias.data_ptr()), num_tokens,     \
@@ -890,7 +789,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
      case torch::kBFloat16:                                                 \
        vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>(                    \
            reinterpret_cast<T*>(scores.mutable_data_ptr()),                 \
-            reinterpret_cast<T*>(group_scores.mutable_data_ptr()),           \
            reinterpret_cast<float*>(topk_values.mutable_data_ptr()),        \
            reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()),        \
            reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()),         \
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -58,7 +58,7 @@ TEMPLATE = (
    "( MARLIN_KERNEL_PARAMS );"
 )

-THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]

 THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]

--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@@ -3,8 +3,8 @@
  #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/gptq_marlin/marlin.cuh"
-#include "quantization/gptq_marlin/marlin_dtypes.cuh"
+#include "quantization/marlin/marlin.cuh"
+#include "quantization/marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

 #define MARLIN_KERNEL_PARAMS                                          \
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
@@ -23,10 +23,10 @@
  #define MARLIN_NAMESPACE_NAME marlin_moe_wna16
 #endif

-#include "quantization/gptq_marlin/marlin.cuh"
-#include "quantization/gptq_marlin/marlin_dtypes.cuh"
-#include "quantization/gptq_marlin/dequant.h"
-#include "quantization/gptq_marlin/marlin_mma.h"
+#include "quantization/marlin/marlin.cuh"
+#include "quantization/marlin/marlin_dtypes.cuh"
+#include "quantization/marlin/dequant.h"
+#include "quantization/marlin/marlin_mma.h"
 #include "core/scalar_type.hpp"

 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -126,14 +126,16 @@ thread_config_t small_batch_thread_configs[] = {

    // thread_k, thread_n, num_threads
    {128, 128, 256},
-    {64, 128, 128}};
+    {64, 128, 128},
+    {128, 64, 128}};

 thread_config_t large_batch_thread_configs[] = {
    // Ordered by priority

    // thread_k, thread_n, num_threads
    {64, 256, 256},
-    {64, 128, 128}};
+    {64, 128, 128},
+    {128, 64, 128}};

 typedef struct {
  int blocks_per_sm;
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -4,7 +4,13 @@

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
-                  torch::Tensor& gating_output, bool renormalize);
+                  torch::Tensor& gating_output, bool renormalize,
+                  std::optional<torch::Tensor> bias);
+
+void topk_sigmoid(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
+                  torch::Tensor& token_expert_indices,
+                  torch::Tensor& gating_output, bool renormalize,
+                  std::optional<torch::Tensor> bias);

 void moe_sum(torch::Tensor& input, torch::Tensor& output);

--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -42,7 +42,7 @@ void moe_permute(
  auto sort_workspace = torch::empty(
      {sorter_size},
      torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
-  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
+  torch::Tensor topk_ids_for_sort = topk_ids;
  auto permuted_experts_id = torch::empty_like(topk_ids);
  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);

@@ -62,12 +62,13 @@ void moe_permute(
    const int* expert_map_ptr = get_ptr<int>(expert_map.value());
    valid_num_ptr =
        get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
-    preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
+    topk_ids_for_sort = topk_ids.clone();
+    preprocessTopkIdLauncher(get_ptr<int>(topk_ids_for_sort), n_token * topk,
                             expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
  sortAndScanExpert(
-      get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
+      get_ptr<const int>(topk_ids_for_sort), get_ptr<int>(token_expert_indices),
      get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -109,7 +109,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
      sorted_indices, total_indices, num_experts, expert_first_token_offset);
 }

-void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
                       int* permuted_experts, int* permuted_rows,
                       int64_t* expert_first_token_offset, int num_rows,
                       int num_experts, int num_experts_per_node, int k,
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -48,7 +48,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
                                   int64_t* expert_first_token_offset,
                                   cudaStream_t stream);

-void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
+void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
                       int* permuted_experts, int* permuted_rows,
                       int64_t* expert_first_token_offset, int num_rows,
                       int num_experts, int num_experts_per_node, int k,
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -62,6 +62,12 @@ __device__ __forceinline__ float toFloat(T value) {
    }
 }

+// Scoring function applied to the gating logits before top-k expert selection.
+enum ScoringFunc {
+  SCORING_SOFTMAX = 0, // softmax across the experts of each row
+  SCORING_SIGMOID = 1  // element-wise sigmoid, 1 / (1 + exp(-x))
+};
+
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
@@ -125,6 +131,27 @@ __launch_bounds__(TPB) __global__
    }
 }

+// Element-wise sigmoid over the gating logits: each thread block converts one row of `input` into float `output`.
+template <int TPB, typename InputType>
+__launch_bounds__(TPB) __global__
+    void moeSigmoid(const InputType* input, const bool* finished, float* output, const int num_cols)
+{
+    // Rows flagged in `finished` (when provided) are skipped; nothing is written for them.
+    const bool row_is_finished = (finished != nullptr) && finished[blockIdx.x];
+    if (row_is_finished)
+    {
+        return;
+    }
+    // Block blockIdx.x owns one row; its threads walk the columns in steps of TPB.
+    const int row_base = blockIdx.x * num_cols;
+    for (int col = threadIdx.x; col < num_cols; col += TPB)
+    {
+        const float logit = toFloat(input[row_base + col]);
+        // sigmoid(x) = 1 / (1 + e^-x)
+        output[row_base + col] = 1.0f / (1.0f + __expf(-logit));
+    }
+}
+
 template <int TPB, typename IndType>
 __launch_bounds__(TPB) __global__ void moeTopK(
    const float* inputs_after_softmax,
@@ -136,7 +163,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
    const int k,
    const int start_expert,
    const int end_expert,
-    const bool renormalize)
+    const bool renormalize,
+    const float* bias)
 {

    using cub_kvp = cub::KeyValuePair<int, float>;
@@ -162,7 +190,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
        {
            const int idx = thread_read_offset + expert;
            inp_kvp.key = expert;
-            inp_kvp.value = inputs_after_softmax[idx];
+
+            // Apply correction bias if provided
+            if (bias != nullptr) {
+              inp_kvp.value = inputs_after_softmax[idx] + bias[expert];
+            } else {
+              inp_kvp.value = inputs_after_softmax[idx];
+            }

            for (int prior_k = 0; prior_k < k_idx; ++prior_k)
            {
@@ -186,12 +220,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
            const bool should_process_row = row_is_active && node_uses_expert;

            const int idx = k * block_row + k_idx;
-            output[idx] = result_kvp.value;
+            // Return the unbiased scores for output weights
+            output[idx] = inputs_after_softmax[thread_read_offset + expert];
            indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
            assert(indices[idx] >= 0);
            source_rows[idx] = k_idx * num_rows + block_row;
            if (renormalize) {
-                selected_sum += result_kvp.value;
+                selected_sum += inputs_after_softmax[thread_read_offset + expert];
            }
        }
        __syncthreads();
@@ -225,10 +260,12 @@ __launch_bounds__(TPB) __global__ void moeTopK(
  2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType,
+          typename InputType = float, ScoringFunc SF>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
-    void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
-        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
+    void topkGating(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
+        int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
+        const float* bias)
 {
    static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
                      std::is_same_v<InputType, __half>,
@@ -353,61 +390,89 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
        }
    }

-    // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
-    // convert to float afterwards for the exp + sum reduction.
-    float thread_max = row_chunk[0];
+    if constexpr (SF == SCORING_SOFTMAX) {
+      // First, we perform a max reduce within the thread.
+      float thread_max = row_chunk[0];
 #pragma unroll
-    for (int ii = 1; ii < VPT; ++ii)
-    {
+      for (int ii = 1; ii < VPT; ++ii) {
        thread_max = max(thread_max, row_chunk[ii]);
-    }
+      }

 // Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
 #pragma unroll
-    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
-    {
+      for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
+      {
        thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
-    }
+      }

-    // From this point, thread max in all the threads have the max within the row.
-    // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
-    float row_sum = 0;
+      // From this point, thread max in all the threads have the max within the row.
+      // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
+      float row_sum = 0;
 #pragma unroll
-    for (int ii = 0; ii < VPT; ++ii)
-    {
+      for (int ii = 0; ii < VPT; ++ii)
+      {
        row_chunk[ii] = expf(row_chunk[ii] - thread_max);
        row_sum += row_chunk[ii];
-    }
+      }

 // Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
 #pragma unroll
-    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
-    {
+      for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
+      {
        row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
-    }
+      }

-    // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
-    // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
-    // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
-    // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
-    // argmax after computing the softmax.
-    const float reciprocal_row_sum = 1.f / row_sum;
+      // From this point, all threads have the max and the sum for their rows in the thread_max and row_sum variables
+      // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
+      // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
+      // However, this kernel will likely not be a bottleneck and it seems better to more closely match torch and find the
+      // argmax after computing the softmax.
+      const float reciprocal_row_sum = 1.f / row_sum;

 #pragma unroll
-    for (int ii = 0; ii < VPT; ++ii)
-    {
+      for (int ii = 0; ii < VPT; ++ii)
+      {
        row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
+      }
+    } else if constexpr (SF == SCORING_SIGMOID) {
+#pragma unroll
+      for (int ii = 0; ii < VPT; ++ii)
+      {
+        row_chunk[ii] = 1.0f / (1.0f + __expf(-row_chunk[ii]));
+      }
    }

-    // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
+    static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
+
+    // If bias is not null, use biased value for selection
+    float row_chunk_for_choice[VPT];
+    // Apply correction bias
+    if (bias != nullptr) {
+#pragma unroll
+      for (int ldg = 0; ldg < LDG_PER_THREAD; ++ldg) {
+#pragma unroll
+        for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
+          const int expert = first_elt_read_by_thread + ldg * COLS_PER_GROUP_LDG + ii;
+          float bias_val = expert < NUM_EXPERTS ? bias[expert] : 0.0f;
+          row_chunk_for_choice[ldg * ELTS_PER_LDG + ii] = row_chunk[ldg * ELTS_PER_LDG + ii] + bias_val;
+        }
+      }
+    } else {
+#pragma unroll
+      for (int ii = 0; ii < VPT; ++ii) {
+        row_chunk_for_choice[ii] = row_chunk[ii];
+      }
+    }
+
+    // row_chunk now holds the softmax / sigmoid scores; row_chunk_for_choice holds the scores used for selection. Find the topk elements in each row, along
    // with the max index.
    int start_col = first_elt_read_by_thread;
-    static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;

    float selected_sum = 0.f;
    for (int k_idx = 0; k_idx < k; ++k_idx)
    {
        // First, each thread does the local argmax
+        float max_val_for_choice = row_chunk_for_choice[0];
        float max_val = row_chunk[0];
        int expert = start_col;
 #pragma unroll
@@ -416,12 +481,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
 #pragma unroll
            for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
            {
+                float val_for_choice = row_chunk_for_choice[ldg * ELTS_PER_LDG + ii];
                float val = row_chunk[ldg * ELTS_PER_LDG + ii];

                // No check on the experts here since columns with the smallest index are processed first and only
                // updated if > (not >=)
-                if (val > max_val)
+                if (val_for_choice > max_val_for_choice)
                {
+                    max_val_for_choice = val_for_choice;
                    max_val = val;
                    expert = col + ii;
                }
@@ -434,12 +501,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
 #pragma unroll
        for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
        {
+            float other_max_for_choice = VLLM_SHFL_XOR_SYNC_WIDTH(max_val_for_choice, mask, THREADS_PER_ROW);
            float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
            int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);

            // We want lower indices to "win" in every thread so we break ties this way
-            if (other_max > max_val || (other_max == max_val && other_expert < expert))
+            if (other_max_for_choice > max_val_for_choice || (other_max_for_choice == max_val_for_choice && other_expert < expert))
            {
+                max_val_for_choice = other_max_for_choice;
                max_val = other_max;
                expert = other_expert;
            }
@@ -474,7 +543,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
            {
                const int offset_for_expert = expert % ELTS_PER_LDG;
                // Safe to set to any negative value since row_chunk values must be between 0 and 1.
-                row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
+                row_chunk_for_choice[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
            }
        }
    }
@@ -508,10 +577,10 @@ struct TopkConstants
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
-void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType, ScoringFunc SF>
+void topkGatingLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
-    cudaStream_t stream)
+    const float* bias, cudaStream_t stream)
 {
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
@@ -521,43 +590,51 @@ void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finishe
    const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
-        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
+    topkGating<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType, SF><<<num_blocks, block_dim, 0, stream>>>(
+        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize, bias);
 }

 #ifndef USE_ROCM
-#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                          \
-    static_assert(WARP_SIZE == 32,                                                    \
-                  "Unsupported warp size. Only 32 is supported for CUDA");            \
-    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
-        gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,     \
-        num_tokens, topk, 0, num_experts, renormalize, stream);
+  #define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                   \
+    static_assert(WARP_SIZE == 32,                                            \
+                  "Unsupported warp size. Only 32 is supported for CUDA");    \
+    topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES, \
+                             IndType, InputType, SF>(                         \
+        gating_output, nullptr, topk_weights, topk_indices,                   \
+        token_expert_indices, num_tokens, topk, 0, num_experts, renormalize,  \
+        bias, stream);
 #else
-#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                             \
-    if (WARP_SIZE == 64) {                                                               \
-        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>(       \
-            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,    \
-            num_tokens, topk, 0, num_experts, renormalize, stream);                      \
-    } else if (WARP_SIZE == 32) {                                                        \
-        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>(       \
-            gating_output, nullptr, topk_weights, topk_indices, token_expert_indices,    \
-            num_tokens, topk, 0, num_experts, renormalize, stream);                      \
-    } else {                                                                             \
-        assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
+  #define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                    \
+    if (WARP_SIZE == 64) {                                                     \
+      topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES,       \
+                               IndType, InputType, SF>(                        \
+          gating_output, nullptr, topk_weights, topk_indices,                  \
+          token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
+          bias, stream);                                                       \
+    } else if (WARP_SIZE == 32) {                                              \
+      topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES,       \
+                               IndType, InputType, SF>(                        \
+          gating_output, nullptr, topk_weights, topk_indices,                  \
+          token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
+          bias, stream);                                                       \
+    } else {                                                                   \
+      assert(false &&                                                          \
+             "Unsupported warp size. Only 32 and 64 are supported for ROCm");  \
    }
 #endif

-template <typename IndType, typename InputType>
-void topkGatingSoftmaxKernelLauncher(
+template <typename IndType, typename InputType, ScoringFunc SF>
+void topkGatingKernelLauncher(
    const InputType* gating_output,
    float* topk_weights,
    IndType* topk_indices,
    int* token_expert_indices,
-    float* softmax_workspace,
+    float* workspace,
    const int num_tokens,
    const int num_experts,
    const int topk,
    const bool renormalize,
+    const float* bias,
    cudaStream_t stream) {
    static constexpr int WARPS_PER_TB = 4;
    static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
@@ -569,64 +646,71 @@ void topkGatingSoftmaxKernelLauncher(
 #endif
    switch (num_experts) {
        case 1:
-            LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 2:
-            LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 4:
-            LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 8:
-            LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 16:
-            LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 32:
-            LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 64:
-            LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 128:
-            LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 256:
-            LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 512:
-            LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            LAUNCH_TOPK(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        // (CUDA only) support multiples of 64 when num_experts is not power of 2.
        // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
        // alternatively we can test 4 bytes loading and enable it in future.
 #ifndef USE_ROCM
        case 192:
-            LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            LAUNCH_TOPK(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
            break;
        case 320:
-            LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            LAUNCH_TOPK(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
            break;
        case 384:
-            LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            LAUNCH_TOPK(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
            break;
        case 448:
-            LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            LAUNCH_TOPK(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
            break;
        case 576:
-            LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            LAUNCH_TOPK(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
            break;
 #endif
        default: {
-            TORCH_CHECK(softmax_workspace != nullptr,
-                "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
+            TORCH_CHECK(workspace != nullptr,
+                "workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
            static constexpr int TPB = 256;
-            moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
-                gating_output, nullptr, softmax_workspace, num_experts);
+            if constexpr (SF == SCORING_SOFTMAX) {
+              moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
+                gating_output, nullptr, workspace, num_experts);
+            } else if constexpr (SF == SCORING_SIGMOID) {
+              moeSigmoid<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
+                gating_output, nullptr, workspace, num_experts);
+            } else {
+                TORCH_CHECK(false, "Unsupported scoring func");
+            }
            moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-                softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
-                num_experts, topk, 0, num_experts, renormalize);
+                workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
+                num_experts, topk, 0, num_experts, renormalize, bias);
        }
    }
 }
@@ -635,40 +719,55 @@ void topkGatingSoftmaxKernelLauncher(
 } // namespace vllm


-template<typename ComputeType>
-void dispatch_topk_softmax_launch(
+template<typename ComputeType, vllm::moe::ScoringFunc SF>
+void dispatch_topk_launch(  // validates the optional bias, then dispatches on the dtype of topk_indices
    torch::Tensor& gating_output,
    torch::Tensor& topk_weights,
    torch::Tensor& topk_indices,
    torch::Tensor& token_expert_indices,
    torch::Tensor& softmax_workspace,
-    int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
-{
+    int num_tokens, int num_experts, int topk, bool renormalize,
+    std::optional<torch::Tensor> bias,
+    cudaStream_t stream)
+ {
+    const float* bias_ptr = nullptr;  // stays null when no bias is supplied; the kernels then skip the bias add
+    if (bias.has_value()) {
+      const torch::Tensor& bias_tensor = bias.value();
+      TORCH_CHECK(bias_tensor.scalar_type() == at::ScalarType::Float, "bias tensor must be float32");
+      TORCH_CHECK(bias_tensor.device() == gating_output.device(), "bias tensor must be on the same device as gating_output");
+      TORCH_CHECK(bias_tensor.dim() == 1, "bias tensor must be 1D");
+      TORCH_CHECK(bias_tensor.size(0) == num_experts, "bias size mismatch, expected: ", num_experts);
+      TORCH_CHECK(bias_tensor.is_contiguous(), "bias tensor must be contiguous");
+      bias_ptr = bias_tensor.data_ptr<float>();
+    }
    if (topk_indices.scalar_type() == at::ScalarType::Int) {
-        vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
+        vllm::moe::topkGatingKernelLauncher<int, ComputeType, SF>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
-            num_tokens, num_experts, topk, renormalize, stream);
+            num_tokens, num_experts, topk, renormalize,
+            bias_ptr, stream);
    } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
-        vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
+        vllm::moe::topkGatingKernelLauncher<uint32_t, ComputeType, SF>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<uint32_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
-            num_tokens, num_experts, topk, renormalize, stream);
+            num_tokens, num_experts, topk, renormalize,
+            bias_ptr, stream);
    } else {
        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
-        vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
+        vllm::moe::topkGatingKernelLauncher<int64_t, ComputeType, SF>(
            reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
            topk_weights.data_ptr<float>(),
            topk_indices.data_ptr<int64_t>(),
            token_expert_indices.data_ptr<int>(),
            softmax_workspace.data_ptr<float>(),
-            num_tokens, num_experts, topk, renormalize, stream);
+            num_tokens, num_experts, topk, renormalize,
+            bias_ptr, stream);
    }
 }

@@ -677,7 +776,8 @@ void topk_softmax(
    torch::Tensor& topk_indices,                // [num_tokens, topk]
    torch::Tensor& token_expert_indices,        // [num_tokens, topk]
    torch::Tensor& gating_output,               // [num_tokens, num_experts]
-    bool renormalize)
+    bool renormalize,
+    std::optional<torch::Tensor> bias)
 {
    const int num_experts = gating_output.size(-1);
    const auto num_tokens = gating_output.numel() / num_experts;
@@ -693,14 +793,55 @@ void topk_softmax(
    torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

    if (gating_output.scalar_type() == at::ScalarType::Float) {
-        dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices, 
-            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+        dispatch_topk_launch<float, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
-        dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices, 
-            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+        dispatch_topk_launch<__half, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
-        dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices, 
-            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
+        dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
+    } else {
+        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
+    }
+}
+
+void topk_sigmoid(
+    torch::Tensor& topk_weights,                // [num_tokens, topk]
+    torch::Tensor& topk_indices,                // [num_tokens, topk]
+    torch::Tensor& token_expert_indices,        // [num_tokens, topk]
+    torch::Tensor& gating_output,               // [num_tokens, num_experts]
+    bool renormalize,
+    std::optional<torch::Tensor> bias)
+{
+    const int num_experts = gating_output.size(-1);
+    const auto num_tokens = gating_output.numel() / num_experts;
+    const int topk = topk_weights.size(-1);
+
+    const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
+    const bool needs_workspace = !is_pow_2 || num_experts > 256;
+    const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
+
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
+    torch::Tensor workspace = torch::empty({workspace_size}, workspace_options);
+
+    if (gating_output.scalar_type() == at::ScalarType::Float) {
+        dispatch_topk_launch<float, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::Half) {
+        dispatch_topk_launch<__half, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
+    } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
+        dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
+            token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
+            bias, stream);
    } else {
        TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
    }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -5,9 +5,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Apply topk softmax to the gating outputs.
  m.def(
      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
+      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
+      "bias) -> ()");
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

+  // Apply topk sigmoid to the gating outputs.
+  m.def(
+      "topk_sigmoid(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
+      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
+      "bias) -> ()");
+  m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);
+
  // Calculate the result of moe by summing up the partial results
  // from all selected experts.
  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -299,7 +293,8 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);

 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_scale,
-                      torch::Tensor const& input_scale);
+                      torch::Tensor const& input_scale,
+                      bool is_sf_swizzled_layout);

 void scaled_fp4_experts_quant(
    torch::Tensor& output, torch::Tensor& output_scale,
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -27,17 +27,24 @@

 #include "cuda_utils.h"
 #include "launch_bounds_utils.h"
+
+// Define before including nvfp4_utils.cuh so the header
+// can use this macro during compilation.
+#define NVFP4_ENABLE_ELTS16 1
 #include "nvfp4_utils.cuh"

 namespace vllm {

 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
-    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
-                             float const* SFScale, uint32_t* out,
-                             uint32_t* SFout) {
-  using PackedVec = PackedVec<Type>;
+__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols,
+                             int32_t num_padded_cols,
+                             Type const* __restrict__ in,
+                             float const* __restrict__ SFScale,
+                             uint32_t* __restrict__ out,
+                             uint32_t* __restrict__ SFout) {
+  using PackedVec = vllm::PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -49,34 +56,60 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
  // Get the global scaling factor, which will be applied to the SF.
  // Note SFScale is the same as next GEMM's alpha, which is
  // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const SFScaleVal = (SFScale == nullptr) ? 1.0f : SFScale[0];
+
+  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
+  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;

  // Input tensor row/col loops.
  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
-         colIdx += blockDim.x) {
+    if (colIdx < num_padded_cols) {
+      PackedVec in_vec;
+      PackedVec in_vec2;
      int64_t inOffset =
          rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
      int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
                          numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-      PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];

-      // Get the output tensor offset.
-      // Same as inOffset because 8 elements are packed into one uint32_t.
-      int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      auto& out_pos = out[outOffset];
+      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
+      if constexpr (CVT_FP4_PACK16) {
+        ld256_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+            valid);
+        ld256_or_zero_cg_u32<Type>(
+            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
+            valid);
+      } else {
+        ld128_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+            valid);
+        ld128_or_zero_cg_u32<Type>(
+            in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
+            valid);
+      }

      // Compute silu and mul
-      PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
+      PackedVec out_silu_mul = compute_silu_mul<Type>(in_vec, in_vec2);

      auto sf_out =
          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                             CVT_FP4_NUM_THREADS_PER_SF>(
              rowIdx, colIdx, numKTiles, SFout);

-      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
-                                                     sf_out);
+      auto out_val =
+          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
+              out_silu_mul, SFScaleVal, sf_out);
+
+      if (valid) {
+        if constexpr (CVT_FP4_PACK16) {
+          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
+          uint64_t packed64 =
+              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
+          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
+        } else {
+          out[inOffset] = out_val;
+        }
+      }
    }
  }
 }
@@ -103,17 +136,23 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
-  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
  int const numBlocksPerSM =
      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+  // Each thread handles one pack of ELTS_PER_THREAD output columns.
+  int num_packed_cols = int(n / ELTS_PER_THREAD);
+
+  int grid_y = vllm::div_round_up(num_packed_cols, static_cast<int>(block.x));
+  int grid_x = std::min(
+      int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
+  dim3 grid(grid_x, grid_y);

  VLLM_DISPATCH_HALF_TYPES(
      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
        vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
-            m, n, input_ptr, input_sf_ptr,
+            m, n, num_packed_cols, input_ptr, input_sf_ptr,
            reinterpret_cast<uint32_t*>(output_ptr),
            reinterpret_cast<uint32_t*>(sf_out));
      });
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -140,8 +140,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
                                           CVT_FP4_NUM_THREADS_PER_SF>(
            rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);

-    out_pos =
-        cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
+    out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
+        quant_input, SFScaleVal, sf_out);
  }
 }

@@ -246,8 +246,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
                                           CVT_FP4_NUM_THREADS_PER_SF>(
            rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);

-    out_pos =
-        cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
+    out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
+        quant_input, SFScaleVal, sf_out);
  }
 }

--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -21,7 +21,8 @@
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                             torch::Tensor const& input,
                             torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf);
+                             torch::Tensor const& input_sf,
+                             bool is_sf_swizzled_layout);
 #endif

 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -51,10 +52,12 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
 #endif

 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
-                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+                      torch::Tensor& output_sf, torch::Tensor const& input_sf,
+                      bool is_sf_swizzled_layout) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
-  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
+  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
+                                 is_sf_swizzled_layout);
 #endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
 }
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -27,29 +27,23 @@

 #include "cuda_utils.h"
 #include "launch_bounds_utils.h"
+
+// Define before including nvfp4_utils.cuh so the header
+// can use this macro during compilation.
+#define NVFP4_ENABLE_ELTS16 1
 #include "nvfp4_utils.cuh"

 namespace vllm {

-template <typename Int>
-__host__ __device__ inline Int round_up(Int x, Int y) {
-  static_assert(std::is_integral_v<Int>,
-                "round_up argument must be integral type");
-  return ((x + y - 1) / y) * y;
-}
-
-// Compute effective rows for grid configuration with swizzled SF layouts.
-inline int computeEffectiveRows(int m) {
-  constexpr int ROW_TILE = 128;
-  return round_up(m, ROW_TILE);
-}
-
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
-    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
-                    float const* SFScale, uint32_t* out, uint32_t* SFout) {
-  using PackedVec = PackedVec<Type>;
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, int32_t num_padded_cols,
+                    Type const* __restrict__ in,
+                    float const* __restrict__ SFScale,
+                    uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
+  using PackedVec = vllm::PackedVec<Type>;
+
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -59,33 +53,31 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
  int32_t const numKTiles = (numCols + 63) / 64;

  int sf_m = round_up<int>(numRows, 128);
-  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
-  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
-  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
+  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
+  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;

  // Get the global scaling factor, which will be applied to the SF.
  // Note SFScale is the same as next GEMM's alpha, which is
  // (448.f / (Alpha_A / 6.f)).
-  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];

  // Iterate over all rows and cols including padded ones -
  //  ensures we visit every single scale factor address to initialize it.
  for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x;
-         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
-         colIdx += blockDim.x) {
-      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
-
+    if (colIdx < num_padded_cols) {
      PackedVec in_vec;
      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;

      // If we are outside valid rows OR outside valid columns -> Use Zeros
-      if (rowIdx >= numRows || elem_idx >= numCols) {
-        memset(&in_vec, 0, sizeof(PackedVec));
-
+      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
+      if constexpr (CVT_FP4_PACK16) {
+        ld256_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+            valid);
      } else {
-        // Valid Region: Load actual data
-        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+        ld128_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+            valid);
      }

      auto sf_out =
@@ -94,13 +86,85 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
              rowIdx, colIdx, numKTiles, SFout);

      auto out_val =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
+          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
+              in_vec, global_scale, sf_out);

      // We do NOT write output for padding because the 'out' tensor is not
      // padded.
-      if (rowIdx < numRows && elem_idx < numCols) {
-        // Same as inOffset because 8 elements are packed into one uint32_t.
-        out[inOffset] = out_val;
+      if (valid) {
+        if constexpr (CVT_FP4_PACK16) {
+          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
+          uint64_t packed64 =
+              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
+          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
+        } else {
+          out[inOffset] = out_val;
+        }
+      }
+    }
+  }
+}
+
+// Row-major (non-swizzled) SF variant of cvt_fp16_to_fp4; UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
+    cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
+                             int32_t sf_n_unpadded, Type const* __restrict__ in,
+                             float const* __restrict__ SFScale,
+                             uint32_t* __restrict__ out,
+                             uint32_t* __restrict__ SFout) {
+  using PackedVec = vllm::PackedVec<Type>;
+
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
+  int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
+
+  // No padding here: visit only real rows. colIdx counts packs of
+  //  CVT_FP4_ELTS_PER_THREAD elements, so bound it in packs, not in SFs.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    if (colIdx < sf_n_unpadded * CVT_FP4_NUM_THREADS_PER_SF) {
+      PackedVec in_vec;
+      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+
+      // Predicated load: a pack past the last valid column reads as zeros.
+      bool valid = (rowIdx < numRows) && (elem_idx < numCols);
+      if constexpr (CVT_FP4_PACK16) {
+        ld256_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
+            valid);
+      } else {
+        ld128_or_zero_cg_u32<Type>(
+            in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
+            valid);
+      }
+
+      auto sf_out =
+          sf_out_rowmajor_u8<uint32_t>(rowIdx, colIdx, sf_n_unpadded, SFout);
+
+      auto out_val =
+          cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
+              in_vec, global_scale, sf_out);
+
+      // We do NOT write output for padding because the 'out' tensor is not
+      // padded.
+      if (valid) {
+        if constexpr (CVT_FP4_PACK16) {
+          int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
+          uint64_t packed64 =
+              (uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
+          reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
+        } else {
+          out[inOffset] = out_val;
+        }
      }
    }
  }
@@ -111,7 +175,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                             torch::Tensor const& input,
                             torch::Tensor const& output_sf,
-                             torch::Tensor const& input_sf) {
+                             torch::Tensor const& input_sf,
+                             bool is_sf_swizzled_layout) {
  int32_t m = input.size(0);
  int32_t n = input.size(1);

@@ -129,19 +194,48 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());

+  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
+
  // Grid, Block size. Each thread converts 8 values.
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
  int const numBlocksPerSM =
      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  int effectiveRows = vllm::computeEffectiveRows(m);
-  dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));

-  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
-    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
-    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-    // NOTE: We don't support e8m0 scales at this moment.
-    vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
-        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
-        reinterpret_cast<uint32_t*>(sf_out));
-  });
-}
+  if (is_sf_swizzled_layout) {
+    int sf_n_int = int(vllm::round_up(sf_n_unpadded, 4) / 4);
+    int32_t num_padded_cols =
+        sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
+
+    int grid_y = vllm::div_round_up(num_padded_cols, static_cast<int>(block.x));
+    int grid_x =
+        std::min(vllm::computeEffectiveRows(m),
+                 std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
+    dim3 grid(grid_x, grid_y);
+
+    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+      // NOTE: We don't support e8m0 scales at this moment.
+      vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
+          m, n, num_padded_cols, input_ptr, input_sf_ptr,
+          reinterpret_cast<uint32_t*>(output_ptr),
+          reinterpret_cast<uint32_t*>(sf_out));
+    });
+  } else {
+    // Kernel colIdx counts ELTS_PER_THREAD-wide packs, so size grid.y in packs.
+    int grid_y = vllm::div_round_up(int(n / ELTS_PER_THREAD), int(block.x));
+    int grid_x = std::min(
+        m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
+    dim3 grid(grid_x, grid_y);
+    VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+      using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+      auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+      // NOTE: We don't support e8m0 scales at this moment.
+      vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
+          <<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
+                                       input_sf_ptr,
+                                       reinterpret_cast<uint32_t*>(output_ptr),
+                                       reinterpret_cast<uint32_t*>(sf_out));
+    });
+  }
+}
--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -19,9 +19,17 @@
 #include <cuda_runtime.h>
 #include <cuda_fp8.h>

-#define ELTS_PER_THREAD 8
-
+#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
+     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
+  #define ELTS_PER_THREAD 16
+constexpr int CVT_FP4_ELTS_PER_THREAD = 16;
+constexpr bool CVT_FP4_PACK16 = true;
+#else
+  #define ELTS_PER_THREAD 8
 constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr bool CVT_FP4_PACK16 = false;
+#endif
+
 constexpr int CVT_FP4_SF_VEC_SIZE = 16;

 namespace vllm {
@@ -68,19 +76,46 @@ struct TypeConverter<__nv_bfloat16> {
  using Type = __nv_bfloat162;
 };

+#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
+     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
+// Define a 32 bytes packed data type.
+template <class Type>
+struct alignas(32) PackedVec {
+  typename TypeConverter<Type>::Type elts[8];
+};
+#else
 // Define a 16 bytes packed data type.
 template <class Type>
-struct PackedVec {
+struct alignas(16) PackedVec {
  typename TypeConverter<Type>::Type elts[4];
 };
+#endif

 template <>
 struct PackedVec<__nv_fp8_e4m3> {
  __nv_fp8x2_e4m3 elts[8];
 };

+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return ((x + y - 1) / y) * y;
+}
+
+template <typename Int>
+__host__ __device__ __forceinline__ Int div_round_up(Int x, Int y) {
+  return (x + y - 1) / y;
+}
+
+// Compute effective rows for grid configuration with swizzled SF layouts.
+inline int computeEffectiveRows(int m) {
+  constexpr int ROW_TILE = 128;
+  return round_up(m, ROW_TILE);
+}
+
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
  uint32_t val;
  asm volatile(
      "{\n"
@@ -101,7 +136,7 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
 }

 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+__device__ __forceinline__ uint32_t fp32_vec8_to_e2m1(float2 (&array)[4]) {
  uint32_t val;
  asm volatile(
      "{\n"
@@ -114,20 +149,115 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
+      "}\n"
      : "=r"(val)
      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
  return val;
 }

+struct u32x2 {
+  uint32_t lo, hi;
+};
+
+using fp4_packed_t = std::conditional_t<CVT_FP4_PACK16, u32x2, uint32_t>;
+
+__device__ __forceinline__ u32x2 fp32_vec16_to_e2m1(float2 (&array)[8]) {
+  u32x2 out;
+  asm volatile(
+      "{\n"
+      ".reg .b8 b0;\n"
+      ".reg .b8 b1;\n"
+      ".reg .b8 b2;\n"
+      ".reg .b8 b3;\n"
+      ".reg .b8 b4;\n"
+      ".reg .b8 b5;\n"
+      ".reg .b8 b6;\n"
+      ".reg .b8 b7;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b0,  %3,  %2;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b1,  %5,  %4;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b2,  %7,  %6;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b3,  %9,  %8;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b4, %11, %10;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b5, %13, %12;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b6, %15, %14;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   b7, %17, %16;\n"
+      "mov.b32 %0, {b0, b1, b2, b3};\n"
+      "mov.b32 %1, {b4, b5, b6, b7};\n"
+      "}\n"
+      : "=r"(out.lo), "=r"(out.hi)
+      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
+        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y),
+        "f"(array[4].x), "f"(array[4].y), "f"(array[5].x), "f"(array[5].y),
+        "f"(array[6].x), "f"(array[6].y), "f"(array[7].x), "f"(array[7].y));
+  return out;
+}
+
+__device__ __forceinline__ uint32_t pack_fp4(float2 (&v)[4]) {
+  return fp32_vec8_to_e2m1(v);
+}
+
+__device__ __forceinline__ u32x2 pack_fp4(float2 (&v)[8]) {
+  return fp32_vec16_to_e2m1(v);
+}
+
 // Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
+__device__ __forceinline__ float reciprocal_approximate_ftz(float a) {
  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(b) : "f"(a));
  return b;
 }

+template <class Type>
+__device__ __forceinline__ void ld128_or_zero_cg_u32(PackedVec<Type>& out,
+                                                     const void* ptr,
+                                                     bool pred) {
+  uint32_t r0, r1, r2, r3;
+
+  asm volatile(
+      "{\n"
+      "  .reg .pred pr;\n"
+      "  setp.ne.u32 pr, %4, 0;\n"
+      "  mov.u32 %0, 0;\n"
+      "  mov.u32 %1, 0;\n"
+      "  mov.u32 %2, 0;\n"
+      "  mov.u32 %3, 0;\n"
+      "  @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
+      "}\n"
+      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
+      : "r"((int)pred), "l"(ptr));
+
+  *reinterpret_cast<uint4*>(&out) = uint4{r0, r1, r2, r3};
+}
+
+template <class Type>
+__device__ __forceinline__ void ld256_or_zero_cg_u32(PackedVec<Type>& out,
+                                                     const void* ptr,
+                                                     bool pred) {
+  uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+  asm volatile(
+      "{\n"
+      "  .reg .pred pr;\n"
+      "  setp.ne.u32 pr, %8, 0;\n"
+      "  mov.u32 %0, 0;\n"
+      "  mov.u32 %1, 0;\n"
+      "  mov.u32 %2, 0;\n"
+      "  mov.u32 %3, 0;\n"
+      "  mov.u32 %4, 0;\n"
+      "  mov.u32 %5, 0;\n"
+      "  mov.u32 %6, 0;\n"
+      "  mov.u32 %7, 0;\n"
+      "  @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
+      "}\n"
+      : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
+        "=r"(r7)
+      : "r"((int)pred), "l"(ptr));
+
+  reinterpret_cast<uint4*>(&out)[0] = uint4{r0, r1, r2, r3};
+  reinterpret_cast<uint4*>(&out)[1] = uint4{r4, r5, r6, r7};
+}
+
 // Compute SF output offset for swizzled tensor core layout.
 // SF layout: [numMTiles, numKTiles, 32, 4, 4]
 // Caller must precompute: numKTiles = (numCols + 63) / 64
@@ -166,21 +296,41 @@ __device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
  return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
 }

+template <class SFType>
+__device__ __forceinline__ uint8_t* sf_out_rowmajor_u8(int row, int pack,
+                                                       int packs_per_row_sf,
+                                                       SFType* SFout) {
+  constexpr int PACK = CVT_FP4_ELTS_PER_THREAD;
+  constexpr int THREADS_PER_SF =
+      CVT_FP4_SF_VEC_SIZE / PACK;  // 1 if PACK=16, 2 else PACK=8
+
+  if (threadIdx.x % THREADS_PER_SF != 0) return nullptr;
+
+  int sf_col =
+      pack / THREADS_PER_SF;  // PACK=16 => sf_col=pack; PACK=8 => sf_col=pack/2
+  int64_t off = (int64_t)row * packs_per_row_sf + sf_col;
+
+  return (uint8_t*)SFout + off;
+}
+
 // Quantizes the provided PackedVec into the uint32_t output
-template <class Type, bool UE8M0_SF = false>
-__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
-                                         uint8_t* SFout) {
+template <class Type, int CVT_FP4_NUM_THREADS_PER_SF, bool UE8M0_SF = false>
+__device__ __forceinline__ fp4_packed_t
+cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
  // Get absolute maximum values among the local 8 values.
  auto localMax = __habs2(vec.elts[0]);

-// Local maximum value.
+  // Local maximum value.
 #pragma unroll
  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
  }

  // Get the absolute maximum among all 16 values (two threads).
-  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+
+  if constexpr (CVT_FP4_NUM_THREADS_PER_SF == 2) {
+    localMax = __hmax2(__shfl_xor_sync(0xffffffffu, localMax, 1), localMax);
+  }
  // Get the final absolute maximum values.
  float vecMax = float(__hmax(localMax.x, localMax.y));

@@ -205,18 +355,17 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
    // Convert back to fp32.
    SFValue = float(tmp);
  }
+
+  // Write the SF to global memory (STG.8).
+  if (SFout) *SFout = fp8SFVal;
+
  // Get the output scale.
  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
  //                       reciprocal(SFScaleVal))
  float outputScale =
-      SFValue != 0 ? reciprocal_approximate_ftz(
-                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
-                   : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
+      SFValue != 0.0f ? reciprocal_approximate_ftz(
+                            SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                      : 0.0f;

  // Convert the input to float.
  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
@@ -233,10 +382,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
  }

  // Convert to e2m1 values.
-  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
-
-  // Write the e2m1 values to global memory.
-  return e2m1Vec;
+  return pack_fp4(fp2Vals);
 }

 // silu in float32
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@@ -7,7 +7,7 @@
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
 #include <iostream>
-#include "../gptq_marlin/marlin_dtypes.cuh"
+#include "../marlin/marlin_dtypes.cuh"
 using marlin::MarlinScalarType2;

 namespace allspark {
--- a/csrc/quantization/gptq_marlin/.gitignore
+++ b/csrc/quantization/gptq_marlin/.gitignore
--- a/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/awq_marlin_repack.cu
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -70,15 +70,6 @@ QUANT_CONFIGS = [
        "thread_m_blocks": THREAD_M_BLOCKS,
        "group_blocks": [-1, 2, 4, 8],
    },
-    # HQQ
-    {
-        "a_type": ["kFloat16"],
-        "b_type": "kU4",
-        "thread_configs": THREAD_CONFIGS,
-        "thread_m_blocks": THREAD_M_BLOCKS,
-        "group_blocks": [4],
-        "is_zp_float": True,
-    },
    # GPTQ-INT4
    {
        "b_type": "kU4B8",
--- a/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin_repack.cu
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -46,7 +46,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,

 }  // namespace marlin

-torch::Tensor gptq_marlin_gemm(
+torch::Tensor marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
    torch::Tensor& b_q_weight,
    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
@@ -528,7 +528,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,

 }  // namespace marlin

-torch::Tensor gptq_marlin_gemm(
+torch::Tensor marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
    torch::Tensor& b_q_weight,
    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
@@ -856,5 +856,5 @@ torch::Tensor gptq_marlin_gemm(
 #endif

 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
-  m.impl("gptq_marlin_gemm", &gptq_marlin_gemm);
+  m.impl("marlin_gemm", &marlin_gemm);
 }
--- a/csrc/quantization/gptq_marlin/marlin.cuh
+++ b/csrc/quantization/gptq_marlin/marlin.cuh
--- a/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
+++ b/csrc/quantization/gptq_marlin/marlin_dtypes.cuh
--- a/csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu
+++ b/csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu
--- a/csrc/quantization/gptq_marlin/marlin_mma.h
+++ b/csrc/quantization/gptq_marlin/marlin_mma.h
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
 }
 }  // namespace

-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
-  auto options_int32 =
-      torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
-  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
-
-  // Swap-AB should be disabled for FP4 path
-  bool may_swap_ab =
-      force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
-                             (topk_ids.numel() <= SWAP_AB_THRESHOLD));
-
-  launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
-                               atomic_buffer, num_experts, n, k, stream,
-                               may_swap_ab);
-}
-
 template <bool SWAP_AB>
 __global__ void compute_problem_sizes_from_expert_offsets(
    const int64_t* __restrict__ expert_first_token_offset,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

-void get_cutlass_moe_mm_problem_sizes_caller(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt);
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
      version_num, ". Required capability: 90, 100, or 120");
 }

-void get_cutlass_moe_mm_problem_sizes(
-    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
-    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
-    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
-    std::optional<bool> force_swap_ab = std::nullopt) {
-  int32_t version_num = get_sm_version_num();
-#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) ||   \
-    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
-    (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
-  get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
-                                          problem_sizes2, num_experts, n, k,
-                                          blockscale_offsets, force_swap_ab);
-  return;
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(
-      false,
-      "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
-      "kernel for CUDA device capability: ",
-      version_num, ". Required capability: 90, 100, or 120");
-}
-
 void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@@ -9,6 +9,10 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
                       const std::optional<at::Tensor>& in_bias,
                       const int64_t CuCount);

+torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
+                         const std::optional<at::Tensor>& in_bias,
+                         const int64_t CuCount);
+
 void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -13,6 +13,13 @@
 #include "dispatch_utils.h"
 #include "quantization/w8a8/fp8/common.cuh"

+// TODO(rasmith): The kernels in this file are susceptible to integer overflow
+// issues, do not take strides, and are unable to handle PyTorch tensors that
+// return is_contiguous() as False (the tensors may actually be contiguous
+// in memory).
+//
+// However, it may be possible to fix these kernels to handle both issues.
+
 #if defined(__HIPCC__) && \
    (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
  #define __HIP__GFX9__
@@ -287,6 +294,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
    V0 += (s.x + s.y);                                                        \
  }

+// Thin wrapper around min() whose operands are both declared uint32_t; it
+// returns the smaller of the two.
+// To avoid LLVM silently upcasting to double
+__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
+  return min(a, b);
+}
+
 #if defined(__HIP__GFX9__)  // TODO: Add NAVI support
 // This version targets cases where A[] fits LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
@@ -334,11 +346,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // - Then the WG will move to another 8 K elements
  // TODO: Logic below will only work when K is multiple of 8
  //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, max_lds_len);
+  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
       k += THRDS * WvPrGrp * A_CHUNK) {
    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);

-    if (k_in >= min(K * N, max_lds_len)) break;
+    if (k_in >= min__(K * N, max_lds_len)) break;

    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
  }
@@ -633,11 +645,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  // - Then the WG will move to another 8 K elements
  // TODO: Logic below will only work when K is multiple of 8
  //----------------------------------------------------
-  for (uint32_t k = 0; k < min(K * N, max_lds_len);
+  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
       k += THRDS * WvPrGrp * A_CHUNK) {
    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);

-    if (k_in >= min(K * N, max_lds_len)) break;
+    if (k_in >= min__(K * N, max_lds_len)) break;

    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
  }
@@ -954,11 +966,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  //----------------------------------------------------
  #define PCML
  #ifndef PCML
-  for (uint32_t k = 0; k < min(K * N, max_lds_len);
+  for (uint32_t k = 0; k < min__(K * N, max_lds_len);
       k += THRDS * WvPrGrp * A_CHUNK) {
    uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);

-    if (k_in >= min(K * N, max_lds_len)) break;
+    if (k_in >= min__(K * N, max_lds_len)) break;

    *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
  }
@@ -975,7 +987,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
             ? kFit
             : (kFit - kFit % TUC);  // round up to multiple of TUC
  // if (kFit == 0) kFit = TUC;
-  kFit = min(kFit, K);
+  kFit = min__(kFit, K);

  float sum[N][YTILE];
  scalar8 sum4[N][YTILE];
@@ -1251,6 +1263,7 @@ int mindiv(int N, int div1, int div2) {
  }
  for (int i = 12; i >= 0; i--)
    if (rnds[0] == rnds[i]) return (div2 - i);
+  return 0;
 }

 torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@@ -1352,6 +1365,536 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
  return out_c;
 }

+#if defined(__gfx950__)  // TODO: Add NAVI support
+  // This version targets big A[] cases, where it is much larger than LDS
+  // capacity
+  #define WVSPLITKRC_1KPASS
+// Skinny GEMM kernel built on 16x16x16 MFMA instructions, with an optional
+// split-K reduction across workgroups through the `glbl` scratch buffer.
+//
+// Parameters (as used in the body):
+//   actlN   - number of valid n indices. The compile-time N may be larger;
+//             A[] rows in the upper half and C[] stores with an n index
+//             >= actlN are skipped.
+//   K       - reduction length; also the row pitch used to index A[] and B[].
+//   M       - number of B[] rows; also the pitch of C[] and glbl
+//             (adr = mindx + M * nindx).
+//   Bx, By  - bias extents, applied as BIAS[(mindx % Bx) + (nindx % By) * M].
+//   B       - operand read with loadnt() and staged through LDS (myStg).
+//   A       - operand copied into LDS (s[]) via global_load_lds.
+//   BIAS    - optional bias; may be null.
+//   glbl    - float scratch: the first M * N floats accumulate partial sums,
+//             the ints that follow are per-output arrival counters.
+//   C       - output.
+//   CuCount - only referenced by the non-1KPASS code path.
+//
+// NOTE: WVSPLITKRC_1KPASS is defined immediately above this kernel, so every
+// `#ifndef WVSPLITKRC_1KPASS` branch below is currently inactive. Those
+// branches reference numCuWithFullK before its declaration and assign to the
+// const k_str, so they need rework before they can be enabled.
+template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
+          int UNRL, int N, int GrpsShrB>
+
+__global__ void __launch_bounds__(WvPrGrp* THRDS)
+    __attribute__((amdgpu_waves_per_eu(1, 1)))
+    wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
+                const int By, const scalar_t* __restrict__ B,
+                const scalar_t* __restrict__ A,
+                const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
+                const int CuCount) {
+  // Use upper half of glbl buffer for atomic reduce counting
+  int* cntr = (int*)(&glbl[M * N]);
+
+  constexpr int NTILE = 16;
+  constexpr int WVLDS_ = (NTILE * THRDS * A_CHUNK);
+  constexpr int APAD = 1;
+  constexpr int ASTRD = 64;
+  constexpr int BPAD = 1;
+  constexpr int BSTRD = 64;
+  constexpr int WVLDS = ((WVLDS_ + (WVLDS_ / BSTRD) * 4 * BPAD));
+
+  constexpr int max_lds_len = LDS_SIZE / 2;
+
+  using scalar16 =
+      __attribute__((__vector_size__((A_CHUNK * 2) * sizeof(float)))) float;
+  using scalar8 =
+      __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float;
+  using half4 =
+      __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(__bf16)))) __bf16;
+  union bigType {
+    scalar_t h[A_CHUNK];
+    float f[A_CHUNK / 2];
+    unsigned int i[A_CHUNK / 2];
+    float2 f2[A_CHUNK / 4];
+    unsigned long l[A_CHUNK / 4];
+    double d[A_CHUNK / 4];
+    half4 h4[A_CHUNK / 4];
+    scalar8 h8;
+  };
+  using big4 = __attribute__((__vector_size__(4 * sizeof(bigType)))) __bf16;
+
+  // LDS is split in two: `stg` stages B[] for the MFMA swizzle (one slice per
+  // group of GrpsShrB waves), `s` holds the A[] tile.
+  __shared__ scalar_t stg[WvPrGrp * WVLDS / GrpsShrB];
+  unsigned int* myStg = (unsigned int*)(&stg[WVLDS * (threadIdx.y / GrpsShrB)]);
+  __shared__ scalar_t s[max_lds_len - WvPrGrp * WVLDS / GrpsShrB];
+
+  #ifndef WVSPLITKRC_1KPASS
+  constexpr int TUC_ = (THRDS * UNRL * A_CHUNK);
+  // find biggest k size that fits padded into LDS
+  constexpr uint32_t kFit__ = (max_lds_len - WvPrGrp * WVLDS / GrpsShrB) / N;
+  constexpr uint32_t kFit_ = (kFit__ * ASTRD) / (APAD + ASTRD);
+  uint32_t kFit = kFit_ - (kFit_ % TUC_);
+  uint32_t kfitsPerRdc = (K + kFit - 1) / kFit;
+
+  // find best k split to fill the CUs
+  if (((K + kfitsPerRdc * kFit - 1) / (kfitsPerRdc * kFit)) * numCuWithFullK <=
+      CuCount)
+    while (true) {
+      while (kFit > TUC_) {
+        uint32_t kFit_ = kFit - TUC_;
+        if (((K + (kfitsPerRdc * kFit_ - 1)) / (kfitsPerRdc * kFit_)) *
+                numCuWithFullK >
+            CuCount)
+          break;
+        kFit = kFit_;
+      }
+      if (((K + ((kfitsPerRdc - 1) * kFit - 1)) / ((kfitsPerRdc - 1) * kFit)) *
+              numCuWithFullK <=
+          CuCount)
+        kfitsPerRdc--;
+      else
+        break;
+    }
+  #else
+  // Single-pass mode: every workgroup handles one fixed 512-element K slice.
+  int constexpr kFit = 512;
+  int constexpr kfitsPerRdc = 1;
+  #endif
+
+  // A cross-workgroup reduction is only needed when one slice does not cover
+  // all of K.
+  bool doRdc = (kfitsPerRdc * kFit < K);
+  uint32_t numCuWithFullK =
+      ((M + (WvPrGrp * YTILE / GrpsShrB) - 1) / (WvPrGrp * YTILE / GrpsShrB));
+  uint32_t Mmod = numCuWithFullK * (WvPrGrp * YTILE / GrpsShrB);
+
+  // given above k-split, find this wave's position
+  uint32_t kFitPdd = kFit + (kFit / ASTRD) * APAD;
+  uint32_t m0 = (blockIdx.x * WvPrGrp / GrpsShrB) * YTILE;
+  uint32_t m1 = ((threadIdx.y % WvPrGrp) / GrpsShrB) * YTILE;
+  uint32_t m = (m0 + m1) % Mmod;
+  const uint32_t k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
+  uint32_t k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
+  // Number of K slices; the reduction is complete once this many partial sums
+  // have been added for an output element.
+  const uint32_t k_rnd = (K + kFit * kfitsPerRdc - 1) / (kFit * kfitsPerRdc);
+
+  scalar8 sum4[N / NTILE / GrpsShrB][1];
+  bigType bigB_[YTILE / GrpsShrB][UNRL];
+  const uint32_t bLoader = (threadIdx.y % GrpsShrB);
+  uint32_t kBase = 0;
+  if (k_str >= K) return;
+  if (m >= Mmod) return;
+
+  bool noreloada = false;
+  // Compile-time off: the in-kernel zeroing of glbl/cntr below is skipped.
+  constexpr bool FAST_UNSAFE_RDC_INIT = false;
+
+  #ifdef WVSPLITKRC_1KPASS
+  // Early glbl init, B[] loading, if 1KPASS
+  if constexpr (FAST_UNSAFE_RDC_INIT) {
+    if (m + (threadIdx.x % 16) < M)
+      if (doRdc)
+        if (k_str == 0) {
+          int mindx = m + (threadIdx.x % 16);
+          int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
+                       (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+          int adr_ = mindx + M * nindx_ / 4;
+          __hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
+                             __HIP_MEMORY_SCOPE_AGENT);
+          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+            for (uint32_t j = 0; j < 4; j++) {
+              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+              int adr = mindx + M * nindx;
+              __hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
+                                 __HIP_MEMORY_SCOPE_AGENT);
+            }
+          }
+        }
+  }
+
+    // Load first B[] chunk
+    // (indices are clamped with min__ so out-of-range lanes still read a
+    // valid address; their data is zeroed when staged to LDS below)
+    #pragma unroll
+  for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+    uint32_t k = k_str + k2 * THRDS * A_CHUNK;
+    uint32_t k_ = k + threadIdx.x * A_CHUNK;
+    const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
+    #pragma unroll
+    for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
+      bigB_[y][k2].h8 = (loadnt(
+          (scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
+  }
+  {
+  #else
+  while (m < Mmod) {
+  #endif
+
+  #ifndef WVSPLITKRC_1KPASS
+    if constexpr (FAST_UNSAFE_RDC_INIT) {
+      if (m + (threadIdx.x % 16) < M)
+        if (doRdc)
+          if (k_str == 0) {
+            int mindx = m + (threadIdx.x % 16);
+            int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
+                         (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+            int adr_ = mindx + M * nindx_ / 4;
+            __hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
+                               __HIP_MEMORY_SCOPE_AGENT);
+            for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+              for (uint32_t j = 0; j < 4; j++) {
+                int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                            (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+                int adr = mindx + M * nindx;
+                __hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
+                                   __HIP_MEMORY_SCOPE_AGENT);
+              }
+            }
+          }
+    }
+
+  #endif
+
+  #ifndef WVSPLITKRC_1KPASS
+    for (uint32_t k1 = k_str; k1 < k_end; k1 += THRDS * A_CHUNK * UNRL) {
+  #else
+    const uint32_t k1 = k_str;
+    {
+  #endif
+  #ifndef WVSPLITKRC_1KPASS
+      const bool reloada = (!noreloada) &&
+                           ((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
+                           (k1 < k_end);
+      // load next chunk of A[] to LDS
+      if (reloada) {
+        if (k1 != k_str) kBase += kFit;
+        __syncthreads();
+  #else
+      const bool reloada = (!noreloada) &&
+                           ((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
+                           (k1 < k_end);
+      if (reloada) {
+  #endif
+        // The A[] copy is spread over groups of sprdN waves: each wave in a
+        // group handles a different A[] row (threadIdx.y % sprdN).
+        constexpr int sprdN = 4;
+        const uint32_t thrd = ((threadIdx.y / sprdN) * THRDS + threadIdx.x);
+
+  #ifndef WVSPLITKRC_1KPASS
+    #pragma unroll
+        for (int k = 0; k < kFit; k += THRDS * (WvPrGrp / sprdN) * A_CHUNK) {
+  #else
+        const unsigned int k = 0;
+        {
+  #endif
+          unsigned int kOff = k + (thrd * A_CHUNK);
+          unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
+          const unsigned int k_in = kOffcp + ((threadIdx.y % sprdN)) * K;
+          const unsigned int k_ot = kOff + ((threadIdx.y % sprdN)) * kFitPdd;
+          for (unsigned int n = 0; n < N / 2; n += sprdN) {
+            __builtin_amdgcn_global_load_lds((int*)(&A[k_in + n * K]),
+                                             (int*)(&s[(k_ot + n * kFitPdd)]),
+                                             16, 0, 0);
+            // Upper-half rows at or beyond actlN are not loaded.
+            if (((threadIdx.y % sprdN)) + n + N / 2 >= actlN) continue;
+            __builtin_amdgcn_global_load_lds(
+                (int*)(&A[k_in + (n + N / 2) * K]),
+                (int*)(&s[(k_ot + (n + N / 2) * kFitPdd)]), 16, 0, 0);
+          }
+
+          // Stage loaded B[] to LDS for MFMA swizzling...
+          for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+            uint32_t k = k1 + k2 * THRDS * A_CHUNK;
+            uint32_t k_ = k + threadIdx.x * A_CHUNK;
+            const bool oob_k = (k_ >= K);
+            for (uint32_t y = 0; y < YTILE / GrpsShrB; y++) {
+              uint32_t idx = threadIdx.x * 4 +
+                             (y * GrpsShrB + bLoader) * ((THRDS + BPAD) * 4);
+              // zero out if oob
+              *((scalar8*)&myStg[idx]) =
+                  (oob_k || (y * GrpsShrB + bLoader + m >= M))
+                      ? 0
+                      : bigB_[y][k2].h8;
+            }
+          }
+        }
+      }
+    }
+
+  #ifndef WVSPLITKRC_1KPASS
+    // Fire load of next B[] chunk...
+    if ((k1 + THRDS * A_CHUNK * UNRL < k_end) &&
+        (k1 + THRDS * A_CHUNK * UNRL < K))
+    #pragma unroll
+      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+        uint32_t k = k1 + THRDS * A_CHUNK * UNRL + k2 * THRDS * A_CHUNK;
+        uint32_t k_ = k + threadIdx.x * A_CHUNK;
+        const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
+    #pragma unroll
+        for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
+          bigB_[y][k2].h8 = (loadnt(
+              (scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
+      }
+  #endif
+
+    // B[] staging is cooperative across GrpsShrB, so sync here before reading
+    // back
+    __syncthreads();
+
+    // read back B[] swizzled for MFMA...
+    bigType bigB[YTILE][UNRL];
+    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+      for (uint32_t y = 0; y < YTILE; y++) {
+        unsigned int idx = (threadIdx.x % YTILE) * ((THRDS + BPAD) * 4) +
+                           (threadIdx.x / YTILE) * 4 + y * 16;
+        bigB[y][k2].h8 = *((scalar8*)&myStg[idx]);
+      }
+    }
+
+    // Read back A[] swizzled for MFMA...
+    bigType bigA[N / GrpsShrB][UNRL];
+  #pragma unroll
+    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+      uint32_t k = k1 + k2 * THRDS * A_CHUNK - kBase - k_str;
+  #pragma unroll
+      for (uint32_t nt = 0; nt < N / GrpsShrB; nt += NTILE)
+  #pragma unroll
+        for (uint32_t n = 0; n < NTILE; n++) {
+          uint32_t idxa = (nt + (threadIdx.x % NTILE) +
+                           (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) *
+                              kFitPdd +
+                          A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
+          bigA[nt + n][k2] = *((const bigType*)(&(s[idxa])));
+        }
+    }
+
+    // Do the MFMAs
+    // (the first MFMA of the first K step starts from a zero accumulator;
+    // every later one accumulates into sum4)
+  #pragma unroll
+    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
+  #pragma unroll
+      for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+        if constexpr (std::is_same_v<scalar_t, half>) {
+          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
+              bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
+              (k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
+          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
+              bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
+              0, 0);
+        } else {  // bf16
+          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+              bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
+              (k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
+          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+              bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
+              0, 0);
+        }
+  #pragma unroll
+        for (uint32_t j = 1; j < YTILE; j++) {
+          if constexpr (std::is_same_v<scalar_t, half>) {
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
+                bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
+                0, 0, 0);
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
+                bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
+                0, 0, 0);
+          } else {  // bf16
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+                bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
+                0, 0, 0);
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
+                bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
+                0, 0, 0);
+          }
+        }
+      }
+    }
+  }
+
+  if (!doRdc) {
+    // No split-K reduction needed: add the bias (if any) and store directly.
+    if (m + (threadIdx.x % 16) < M) {
+      scalar_t biases[N / NTILE / GrpsShrB][4] = {0};
+      if (BIAS)
+        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+          for (uint32_t j = 0; j < 4; j++) {
+            int mindx = m + (threadIdx.x % 16);
+            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+            biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
+          }
+        }
+      for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+        for (uint32_t j = 0; j < 4; j++) {
+          int mindx = m + (threadIdx.x % 16);
+          int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                      (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+          // n indices at or beyond actlN are padding up to the compile-time
+          // N; skip them exactly as the reduction paths below do, so no
+          // store lands past the last valid C[] row.
+          if (nindx >= actlN) break;
+          int adr = mindx + M * nindx;
+          if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
+            if (BIAS) sum4[nt][0][j] += __bfloat162float(biases[nt][j]);
+            C[adr] = __float2bfloat16(sum4[nt][0][j]);
+          } else {
+            if (BIAS) sum4[nt][0][j] += __half2float(biases[nt][j]);
+            C[adr] = __float2half(sum4[nt][0][j]);
+          }
+        }
+      }
+    }
+  } else {
+    // Split-K reduction: accumulate partial sums into glbl, bump the arrival
+    // counter, and let the thread that observes the final count write C[].
+    if (m + (threadIdx.x % 16) < M) {
+      int my_cntr;
+      if (!BIAS) {
+        int mindx = m + (threadIdx.x % 16);
+        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
+          for (uint32_t j = 0; j < 4; j++) {
+            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+            int adr = mindx + M * nindx;
+            atomicAdd(&glbl[adr], sum4[nt][0][j]);
+          }
+        int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
+                     (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+        int adr_ = mindx + M * nindx_ / 4;
+        my_cntr = atomicAdd(&cntr[adr_], 1);
+        float vals[N / NTILE / GrpsShrB][4] = {};
+        if (my_cntr + 1 == k_rnd) {
+          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+            for (uint32_t j = 0; j < 4; j++) {
+              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+              int adr = mindx + M * nindx;
+              vals[nt][j] = glbl[adr];
+            }
+          }
+          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+            for (uint32_t j = 0; j < 4; j++) {
+              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+              if (nindx >= actlN) break;
+              int adr = mindx + M * nindx;
+              if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
+                C[adr] = __float2bfloat16(vals[nt][j]);
+              } else {
+                C[adr] = __float2half(vals[nt][j]);
+              }
+            }
+          }
+        }
+      } else {
+        int mindx = m + (threadIdx.x % 16);
+        scalar_t biases[N / NTILE / GrpsShrB][4] = {};
+        // Atomic add the output, read biases
+        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
+          for (uint32_t j = 0; j < 4; j++) {
+            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+            int adr = mindx + M * nindx;
+            atomicAdd(&glbl[adr], sum4[nt][0][j]);
+            biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
+          }
+        int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
+                     (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+        int adr_ = mindx + M * nindx_ / 4;
+        // Update the complete counter
+        my_cntr = atomicAdd(&cntr[adr_], 1);
+        float vals[N / NTILE / GrpsShrB][4] = {};
+        // If we're the last k-shard, read back the value and convert...
+        if (my_cntr + 1 == k_rnd) {
+          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+            for (uint32_t j = 0; j < 4; j++) {
+              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+              int adr = mindx + M * nindx;
+              vals[nt][j] = glbl[adr];
+            }
+          }
+          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+            for (uint32_t j = 0; j < 4; j++) {
+              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+              if (nindx >= actlN) break;
+              int adr = mindx + M * nindx;
+              if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
+                vals[nt][j] += __bfloat162float(biases[nt][j]);
+                C[adr] = __float2bfloat16(vals[nt][j]);
+              } else {
+                vals[nt][j] += __half2float(biases[nt][j]);
+                C[adr] = __float2half(vals[nt][j]);
+              }
+            }
+          }
+        }
+      }
+    }
+
+  #ifndef WVSPLITKRC_1KPASS
+    m0 += CuCount * WvPrGrp * YTILE / GrpsShrB;
+    m = (m0 + m1) % Mmod;
+    k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
+    k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
+    if (k_str >= K) break;
+    kBase = 0;
+  #endif
+  }
+}
+#else   // !defined(__gfx950__) TODO: Add NAVI support
+// Fallback definition of wvSplitKrc_ used when __gfx950__ is not defined.
+// It keeps the same template and parameter list so the host launcher still
+// compiles; the body is just UNREACHABLE_CODE.
+template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
+          int UNRL, int N, int GrpsShrB>
+__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
+                            const int Bx, const int By, const scalar_t* B,
+                            const scalar_t* __restrict__ A,
+                            const scalar_t* __restrict__ BIAS, float* glbl,
+                            // int* cntr,
+                            scalar_t* C, const int CuCount){UNREACHABLE_CODE}
+#endif  // defined(__gfx950__) TODO: Add NAVI support
+
+// Host-side launcher for the wvSplitKrc_ skinny GEMM kernel.
+//
+// in_a    : tensor whose size(0) / size(1) are passed to the kernel as M / K.
+// in_b    : tensor whose size(0) is passed as the actual N; must have the same
+//           dtype as in_a (fp16 or bf16).
+// in_bias : optional bias. For a 2-D bias Bx = size(1) and By = size(0);
+//           otherwise Bx = size(0) and By = 1. An empty bias is treated as
+//           absent.
+// CuCount : number of workgroups to launch (grid.x).
+//
+// Returns a newly allocated {N, M} tensor with in_b's dtype and device.
+// Throws std::runtime_error when N, rounded up to a power of two, is not one
+// of 16, 32, 64 or 128.
+torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
+                         const std::optional<at::Tensor>& in_bias,
+                         const int64_t CuCount) {
+  auto M_in = in_a.size(0);
+  auto N_in = in_b.size(0);
+  auto K_in = in_a.size(1);
+  auto Bx_in =
+      (in_bias.has_value() && in_bias->numel() > 0)
+          ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
+          : 1;
+  auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
+                in_bias->sizes().size() == 2)
+                   ? in_bias->size(0)
+                   : 1;
+
+  TORCH_CHECK(in_a.dtype() == in_b.dtype());
+  TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
+  TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
+              in_a.dtype() == torch::kBFloat16);
+
+  auto out_c = torch::empty(
+      {N_in, M_in},
+      torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));
+
+  // Round N up to the next power of two to select a kernel instantiation.
+  // A bounded 64-bit loop is used instead of 1U << (32 - __builtin_clz(N - 1)):
+  // that expression is undefined for N == 1 (clz of 0) and for N <= 0 (shift
+  // by 32), and it truncates the 64-bit size to 32 bits. The cap at 256 keeps
+  // the scratch allocation small for unsupported sizes, which are rejected by
+  // the switch below.
+  int64_t N_p2 = 1;
+  while (N_p2 < N_in && N_p2 < 256) N_p2 <<= 1;
+
+  // Scratch for the split-K reduction: partial sums plus arrival counters.
+  auto axl_glbl = torch::empty(
+      {N_p2 + N_p2 / 4, M_in + M_in / 4},
+      torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
+  axl_glbl.zero_();  // disable for FAST_UNSAFE_RDC_INIT
+
+  dim3 grid(CuCount);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // Launches one instantiation; note that in_a's data is bound to the
+  // kernel's B parameter and in_b's data to its A parameter.
+#define WVSPLITKrc(_WvPrGrp, _YTILE, _UNRL, _N, _GrpsShrB)                     \
+  {                                                                            \
+    dim3 block(64, _WvPrGrp);                                                  \
+    wvSplitKrc_<fptype, 64, _YTILE, _WvPrGrp, 8, _UNRL, _N, _GrpsShrB>         \
+        <<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
+                                     biasf4, glbl, c, CuCount);                \
+  }
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
+    using fptype = typename scalar<scalar_t>::type;
+    fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
+    const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
+    const fptype* biasf4 =
+        (in_bias.has_value() && in_bias->numel() > 0)
+            ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
+            : nullptr;
+    fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
+    auto glbl = axl_glbl.data_ptr<float>();
+    switch (N_p2) {
+      case 16:
+        WVSPLITKrc(4, 16, 1, 16, 1) break;
+      case 32:
+        WVSPLITKrc(4, 16, 1, 32, 2) break;
+      case 64:
+        WVSPLITKrc(4, 16, 1, 64, 2) break;
+      case 128:
+        WVSPLITKrc(4, 16, 1, 128, 4) break;
+      default:
+        throw std::runtime_error(
+            "Unsupported N value: " + std::to_string(M_in) + "," +
+            std::to_string(K_in) + "," + std::to_string(N_in));
+    }
+  });
+  return out_c;
+}
+
 #if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
@@ -1381,7 +1924,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
  }
  __syncthreads();
@@ -1570,7 +2113,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
  }
  __syncthreads();
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@@ -26,6 +26,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
      "Tensor");
  rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);

+  // Custom gemm op for skinny matrix-matrix multiplication
+  rocm_ops.def(
+      "wvSplitKrc(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
+      "Tensor");
+  rocm_ops.impl("wvSplitKrc", torch::kCUDA, &wvSplitKrc);
+
  // wvSplitK for fp8
  rocm_ops.def(
      "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -303,9 +303,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
  ops.impl("permute_cols", torch::kCUDA, &permute_cols);

-  // gptq_marlin Optimized Quantized GEMM for GPTQ.
+  // Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
  ops.def(
-      "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
+      "marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
      "Tensor? b_bias_or_none,Tensor b_scales, "
      "Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
      "Tensor? "
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "()");
  ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);

-  // A function that computes problem sizes for each expert's multiplication
-  // used by the two mms called from fused MoE operation. It takes topk_ids as
-  // an input, and computes problem_sizes1 and problem_sizes2 only.
-  ops.def(
-      "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
-      "                                 Tensor! problem_sizes1, "
-      "                                 Tensor! problem_sizes2, "
-      "                                 int num_experts, int n, int k, "
-      "                                 Tensor? blockscale_offsets, "
-      "                                 bool? force_swap_ab) -> ()");
-  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
-           &get_cutlass_moe_mm_problem_sizes);
-
  // compute per-expert problem sizes from expert_first_token_offset
  // produced by vLLM's moe_permute kernel
  ops.def(
@@ -559,7 +546,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute NVFP4 block quantized tensor.
  ops.def(
      "scaled_fp4_quant(Tensor! output, Tensor input,"
-      "                 Tensor! output_scale, Tensor input_scale) -> ()");
+      "                 Tensor! output_scale, Tensor input_scale, bool "
+      "is_sf_swizzled_layout) -> ()");
  ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);

  // Compute NVFP4 experts quantization.
@@ -705,7 +693,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
+      "swap_blocks(Tensor src, Tensor! dst,"
+      "            int block_size_in_bytes, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);

  // Reshape the key and value tensors and cache them.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,6 +5,23 @@
 # docs/contributing/dockerfile/dockerfile.md and
 # docs/assets/contributing/dockerfile-stages-dependency.png

+# =============================================================================
+# VERSION MANAGEMENT
+# =============================================================================
+# ARG defaults in this Dockerfile are the source of truth for pinned versions.
+# docker/versions.json is auto-generated for use with docker buildx bake.
+#
+# When updating versions:
+# 1. Edit the ARG defaults below
+# 2. Run: python tools/generate_versions_json.py
+#
+# To query versions programmatically:
+#   jq -r '.variable.CUDA_VERSION.default' docker/versions.json
+#
+# To build with bake:
+#   docker buildx bake -f docker/docker-bake.hcl -f docker/versions.json
+# =============================================================================
+
 ARG CUDA_VERSION=12.9.1
 ARG PYTHON_VERSION=3.12

@@ -117,8 +134,8 @@ ENV UV_LINK_MODE=copy
 # Verify GCC version
 RUN gcc --version

-# Workaround for triton/pytorch issues
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+# Ensure CUDA compatibility library is loaded
+RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig

 # ============================================================
 # SLOW-CHANGING DEPENDENCIES BELOW
@@ -131,16 +148,41 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL

 WORKDIR /workspace

-# install build and runtime dependencies
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install build and runtime dependencies, including PyTorch
+# Check whether to install torch nightly instead of release for this build
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY pyproject.toml pyproject.toml
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
-    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        echo "Installing torch nightly..." \
+        && uv pip install --python /opt/venv/bin/python3 torch torchaudio torchvision --pre \
+        --index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+        && echo "Installing other requirements..." \
+        && /opt/venv/bin/python3 use_existing_torch.py --prefix \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    else \
+        uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    fi
+
+# Track the PyTorch library versions used during the build so that downstream stages can match them.
+# We do this for both nightly and release so we can strip requirements/*.txt as needed.
+# Otherwise library dependencies can upgrade/downgrade torch incorrectly.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip freeze | grep -i "^torch=\|^torchvision=\|^torchaudio=" > torch_lib_versions.txt \
+    && TORCH_LIB_VERSIONS=$(cat torch_lib_versions.txt | xargs) \
+    && echo "Installed torch libs: ${TORCH_LIB_VERSIONS}"

 # CUDA arch list used by torch
 # Explicitly set the list to avoid issues with torch 2.2
 # See https://github.com/pytorch/pytorch/pull/123243
+# From versions.json: .torch.cuda_arch_list
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BUILD BASE IMAGE ####################
@@ -153,8 +195,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 ARG PYTORCH_CUDA_INDEX_BASE_URL

-# install build dependencies
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install build dependencies
 COPY requirements/build.txt requirements/build.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -164,8 +211,18 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
-    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        echo "Installing build requirements without torch..." \
+        && python3 use_existing_torch.py --prefix \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        && echo "Installing torch nightly..." \
+        && uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
+        --index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    else \
+        echo "Installing build requirements..." \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    fi

 WORKDIR /workspace

@@ -197,6 +254,13 @@ ARG VLLM_MAIN_CUDA_VERSION=""
 # Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
 ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"

+# Use existing torch for nightly builds
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        python3 use_existing_torch.py --prefix; \
+    fi
+
+# Build the vLLM wheel
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$USE_SCCACHE" = "1" ]; then \
@@ -240,6 +304,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
+
 #################### CSRC BUILD IMAGE ####################

 #################### EXTENSIONS BUILD IMAGE ####################
@@ -256,7 +321,8 @@ ENV UV_LINK_MODE=copy
 WORKDIR /workspace

 # Build DeepGEMM wheel
-ARG DEEPGEMM_GIT_REF
+# Default moved here from tools/install_deepgemm.sh for centralized version management
+ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
    mkdir -p /tmp/deepgemm/dist && \
@@ -271,8 +337,9 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped

 # Build pplx-kernels and DeepEP wheels
 COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
-ARG PPLX_COMMIT_HASH
-ARG DEEPEP_COMMIT_HASH
+# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
+ARG PPLX_COMMIT_HASH=12cecfd
+ARG DEEPEP_COMMIT_HASH=73b6ea4
 ARG NVSHMEM_VER
 RUN --mount=type=cache,target=/root/.cache/uv \
    mkdir -p /tmp/ep_kernels_workspace/dist && \
@@ -294,8 +361,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 ARG PYTORCH_CUDA_INDEX_BASE_URL

-# install build dependencies
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install build dependencies
 COPY requirements/build.txt requirements/build.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -305,14 +377,23 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
-    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        echo "Installing build requirements without torch..." \
+        && python3 use_existing_torch.py --prefix \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        && echo "Installing torch nightly..." \
+        && uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
+        --index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    else \
+        echo "Installing build requirements..." \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    fi

 WORKDIR /workspace

 # Copy pre-built csrc wheel directly
 COPY --from=csrc-build /workspace/dist /precompiled-wheels
-
 COPY . .

 ARG GIT_REPO_CHECK=0
@@ -325,6 +406,13 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 # Skip adding +precompiled suffix to version (preserves git-derived version)
 ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1

+# Use existing torch for nightly builds
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        python3 use_existing_torch.py --prefix; \
+    fi
+
+# Build the vLLM wheel
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "${vllm_target_device}" = "cuda" ]; then \
@@ -347,7 +435,8 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
    else \
        echo "Skipping wheel size check."; \
    fi
-#################### EXTENSION Build IMAGE ####################
+
+#################### WHEEL BUILD IMAGE ####################

 #################### DEV IMAGE ####################
 FROM base AS dev
@@ -365,12 +454,34 @@ ENV UV_LINK_MODE=copy

 # Install libnuma-dev, required by fastsafetensors (fixes #20384)
 RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
+
+
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install development dependencies
 COPY requirements/lint.txt requirements/lint.txt
+COPY requirements/test.in requirements/test.in
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
-    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        echo "Installing dev requirements plus torch nightly..." \
+        && python3 use_existing_torch.py --prefix \
+        && cat torch_lib_versions.txt >> requirements/test.in \
+        && uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+        && uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | xargs) --pre \
+        -r requirements/dev.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    else \
+        echo "Installing dev requirements..." \
+        && uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    fi
+
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
@@ -453,8 +564,8 @@ ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

-# Workaround for triton/pytorch issues
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+# Ensure CUDA compatibility library is loaded
+RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig

 # ============================================================
 # SLOW-CHANGING DEPENDENCIES BELOW
@@ -474,7 +585,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install FlashInfer pre-compiled kernel cache and binaries
 # This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
-ARG FLASHINFER_VERSION=0.5.3
+# From versions.json: .flashinfer.version
+ARG FLASHINFER_VERSION=0.6.1
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
    && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
@@ -503,14 +615,20 @@ RUN set -eux; \

 # Install vllm-openai dependencies (saves ~2.6s per build)
 # These are stable packages that don't depend on vLLM itself
+# From versions.json: .bitsandbytes.x86_64, .bitsandbytes.arm64
+# From versions.json: .openai_server_extras.timm, .openai_server_extras.runai_model_streamer
+ARG BITSANDBYTES_VERSION_X86=0.46.1
+ARG BITSANDBYTES_VERSION_ARM64=0.42.0
+ARG TIMM_VERSION=">=1.0.17"
+ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
+        BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
+        BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
    fi; \
    uv pip install --system accelerate hf_transfer modelscope \
-        "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+        "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"

 # ============================================================
 # VLLM INSTALLATION (depends on build stage)
@@ -521,11 +639,26 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 ARG PYTORCH_CUDA_INDEX_BASE_URL
 ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

-# Install vllm wheel first, so that torch etc will be installed.
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install vLLM wheel first, so that torch etc will be installed.
+# Check whether to install torch nightly instead of release for this build.
+COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+        echo "Installing torch nightly..." \
+        && uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
+        --index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+        && echo "Installing vLLM..." \
+        && uv pip install --system dist/*.whl --verbose \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    else \
+        echo "Installing vLLM..." \
+        && uv pip install --system dist/*.whl --verbose \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+    fi

 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
@@ -585,12 +718,33 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y git

-# install development dependencies (for testing)
+# We can specify the standard or nightly build of PyTorch
+ARG PYTORCH_NIGHTLY
+
+# Install development dependencies (for testing)
+COPY requirements/lint.txt requirements/lint.txt
+COPY requirements/test.in requirements/test.in
+COPY requirements/test.txt requirements/test.txt
+COPY requirements/dev.txt requirements/dev.txt
+COPY use_existing_torch.py use_existing_torch.py
+COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+        if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
+            echo "Installing dev requirements plus torch nightly..." \
+            && python3 use_existing_torch.py --prefix \
+            && cat torch_lib_versions.txt >> requirements/test.in \
+            && uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+            && uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
+            -r requirements/dev.txt \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+        else \
+            echo "Installing dev requirements..." \
+            && uv pip install --system -r requirements/dev.txt \
+            --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
+        fi \
    fi

 # install development dependencies (for testing)
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -15,9 +15,11 @@
 # Build arguments:
 #   PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
 #   VLLM_CPU_DISABLE_AVX512=false (default)|true
-#   VLLM_CPU_AVX512BF16=false (default)|true
-#   VLLM_CPU_AVX512VNNI=false (default)|true
-#   VLLM_CPU_AMXBF16=false |true (default)
+#   VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
+#   VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
+#   VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
+#   VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
+#   VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
 #

 ######################### COMMON BASE IMAGE #########################
@@ -54,9 +56,12 @@ ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
+
+# Copy requirements files for installation
+COPY requirements/common.txt requirements/common.txt
+COPY requirements/cpu.txt requirements/cpu.txt
+
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    uv pip install --upgrade pip && \
    uv pip install -r requirements/cpu.txt

@@ -88,6 +93,12 @@ ARG GIT_REPO_CHECK=0
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512=0
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
+ARG VLLM_CPU_AVX2=0
+ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
+# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
+ARG VLLM_CPU_AVX512=0
+ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
 # Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
 ARG VLLM_CPU_AVX512BF16=0
 ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
@@ -100,18 +111,19 @@ ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}

 WORKDIR /workspace/vllm

+# Copy build requirements
+COPY requirements/cpu-build.txt requirements/build.txt
+
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
    uv pip install -r requirements/build.txt

 COPY . .
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
-    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

 ######################### TEST DEPS #########################
@@ -119,9 +131,11 @@ FROM base AS vllm-test-deps

 WORKDIR /workspace/vllm

+# Copy test requirements
+COPY requirements/test.in requirements/cpu-test.in
+
 # TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
-RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/cpu-test.in && \
+RUN \
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    remove_packages_not_supported_on_aarch64() { \
      case "$(uname -m)" in \
@@ -132,7 +146,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
      esac; \
    }; \
    remove_packages_not_supported_on_aarch64 && \
-    sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
@@ -200,4 +214,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl

+# Add labels to document build configuration
+LABEL org.opencontainers.image.title="vLLM CPU"
+LABEL org.opencontainers.image.description="vLLM inference engine for CPU platforms"
+LABEL org.opencontainers.image.vendor="vLLM Project"
+LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
+
+# Build configuration labels
+ARG TARGETARCH
+ARG VLLM_CPU_DISABLE_AVX512
+ARG VLLM_CPU_AVX2
+ARG VLLM_CPU_AVX512
+ARG VLLM_CPU_AVX512BF16
+ARG VLLM_CPU_AVX512VNNI
+ARG VLLM_CPU_AMXBF16
+ARG PYTHON_VERSION
+
+LABEL ai.vllm.build.target-arch="${TARGETARCH}"
+LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
+LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
+LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
+LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
+LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
+LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
+LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
+
 ENTRYPOINT ["vllm", "serve"]
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -1,3 +1,11 @@
+#######
+#
+# THIS FILE IS DEPRECATED AND WILL BE REMOVED SHORTLY
+#
+# Please use the standard Dockerfile with PYTORCH_NIGHTLY=1 instead
+#
+#######
+
 # The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing

 # for torch nightly, cuda >=12.6 is required,
@@ -213,15 +221,14 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.5.2
+# release version: v0.6.1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo "git clone flashinfer..." \
-    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
-    && git checkout v0.5.2 \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
    && rm -rf build \
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -85,6 +85,8 @@ ONBUILD COPY ./ vllm/
 FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
+ENV VLLM_REPO=${VLLM_REPO}
+ENV VLLM_BRANCH=${VLLM_BRANCH}
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -301,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    && pip uninstall -y vllm \
    && uv pip install --system *.whl

+# Install RIXL wheel
+RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
+    uv pip install --system /rixl_install/*.whl
+
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
@@ -379,5 +385,5 @@ RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
 CMD ["/bin/bash"]

 #Set entrypoint for vllm-openai official images
-FROM final As vllm-openai
+FROM final AS vllm-openai
 ENTRYPOINT ["vllm", "serve"]
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -198,92 +198,6 @@ RUN cd mori \
 RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install


-###
-### RIXL Build
-###
-FROM build_pytorch AS build_rixl
-ARG RIXL_BRANCH
-ARG RIXL_REPO
-ARG ETCD_BRANCH
-ARG ETCD_REPO
-ARG UCX_BRANCH
-ARG UCX_REPO
-
-ENV ROCM_PATH=/opt/rocm
-ENV UCX_HOME=/usr/local/ucx
-ENV RIXL_HOME=/usr/local/rixl
-ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
-
-# RIXL build system dependences and RDMA support
-RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
-    libgrpc-dev \
-    libgrpc++-dev \
-    libprotobuf-dev \
-    protobuf-compiler-grpc \
-    libcpprest-dev \
-    libaio-dev \
-    librdmacm1 \
-    librdmacm-dev \
-    libibverbs1 \
-    libibverbs-dev \
-    ibverbs-utils \
-    rdmacm-utils \
-    ibverbs-providers
-
-RUN pip install meson auditwheel patchelf tomlkit
-
-WORKDIR /workspace
-
-RUN git clone ${ETCD_REPO} && \
-    cd etcd-cpp-apiv3 && \
-    git checkout ${ETCD_BRANCH} && \
-    mkdir build && cd build && \
-    cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
-    make -j$(nproc) && \
-    make install
-
-RUN cd /usr/local/src && \
-    git clone ${UCX_REPO} &&  \
-    cd ucx  && \
-    git checkout ${UCX_BRANCH} && \
-    ./autogen.sh && \
-    mkdir build && cd build && \
-    ../configure \
-        --prefix=/usr/local/ucx \
-        --enable-shared \
-        --disable-static \
-        --disable-doxygen-doc \
-        --enable-optimizations \
-        --enable-devel-headers \
-        --with-rocm=/opt/rocm \
-        --with-verbs \
-        --with-dm \
-        --enable-mt && \
-    make -j && \
-    make -j install
-
-ENV PATH=/usr/local/ucx/bin:$PATH
-ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
-
-RUN git clone ${RIXL_REPO} /opt/rixl && \
-    cd /opt/rixl && \
-    git checkout ${RIXL_BRANCH} && \
-    meson setup build --prefix=${RIXL_HOME} \
-                     -Ducx_path=${UCX_HOME} \
-                     -Drocm_path=${ROCM_PATH} && \
-    cd build && \
-    ninja && \
-    ninja install
-
-# Generate RIXL wheel
-RUN cd /opt/rixl && mkdir -p /app/install && \
-    ./contrib/build-wheel.sh \
-        --output-dir /app/install \
-        --rocm-dir ${ROCM_PATH} \
-        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
-        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
-
-
 ###
 ### FlashAttention Build
 ###
@@ -365,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
-RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
-    cp /install/*.whl /app/debs

 FROM base AS final
 RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -385,12 +297,6 @@ ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
-ARG RIXL_BRANCH
-ARG RIXL_REPO
-ARG ETCD_BRANCH
-ARG ETCD_REPO
-ARG UCX_BRANCH
-ARG UCX_REPO
 ARG MORI_BRANCH
 ARG MORI_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
@@ -406,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
-    && echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
-    && echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
-    && echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
-    && echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
-    && echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
-    && echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
    && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
    && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
--- a/Show More
+++ b/Show More