Compare commits
371 Commits
.buildkite/lm-eval-harness/configs/models-small-rocm.txt (new file, +5 lines)
@@ -0,0 +1,5 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
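The new config file simply lists the lm-eval-harness model configs to exercise on ROCm, one YAML filename per line. A minimal sketch of how such a list is typically consumed is shown below; the `run_eval.sh` helper name and its `--config` flag are assumptions for illustration only, not the repository's actual entry point.

```bash
#!/usr/bin/env bash
# Hypothetical sketch: iterate over the model configs listed in models-small-rocm.txt
# and run one evaluation per entry. Helper script name and flags are illustrative only.
set -euo pipefail

CONFIG_LIST=".buildkite/lm-eval-harness/configs/models-small-rocm.txt"
CONFIG_DIR=".buildkite/lm-eval-harness/configs"

while IFS= read -r config; do
  # Skip blank lines and comments
  [[ -z "$config" || "$config" == \#* ]] && continue
  echo "Running lm-eval-harness for $config"
  bash .buildkite/lm-eval-harness/run_eval.sh --config "$CONFIG_DIR/$config"  # assumed helper
done < "$CONFIG_LIST"
```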
@@ -1,198 +1,730 @@
steps:
  # aarch64 + CUDA builds
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build arm64 wheel - CUDA 13.0"
    depends_on: ~
    id: build-wheel-arm64-cuda-13-0
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # aarch64 build
  - label: "Build arm64 CPU wheel"
    depends_on: ~
    id: build-wheel-arm64-cpu
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 13.0"
    depends_on: ~
    id: build-wheel-cuda-13-0
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 CPU wheel build
  - label: "Build x86 CPU wheel"
    depends_on: ~
    id: build-wheel-x86-cpu
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
    env:
      DOCKER_BUILDKIT: "1"

  # Build release images (12.9)
  - label: "Build release image (x86)"
    depends_on: ~
    id: build-release-image-x86
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
  - label: "Create multi-arch manifest"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
  - group: "Build Python wheels"
    key: "build-wheels"
    steps:
      - label: "Build wheel - aarch64 - CUDA 12.9"
        depends_on: ~
        id: build-wheel-arm64-cuda-12-9
        agents:
          queue: arm64_cpu_queue_postmerge
        commands:
          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh"
        env:
          DOCKER_BUILDKIT: "1"

      - label: "Build wheel - aarch64 - CUDA 13.0"
        depends_on: ~
        id: build-wheel-arm64-cuda-13-0
        agents:
          queue: arm64_cpu_queue_postmerge
        commands:
          # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
          # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
        env:
          DOCKER_BUILDKIT: "1"

      - label: "Build wheel - aarch64 - CPU"
        depends_on: ~
        id: build-wheel-arm64-cpu
        agents:
          queue: arm64_cpu_queue_postmerge
        commands:
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
        env:
          DOCKER_BUILDKIT: "1"

      - label: "Build wheel - x86_64 - CUDA 12.9"
        depends_on: ~
        id: build-wheel-x86-cuda-12-9
        agents:
          queue: cpu_queue_postmerge
        commands:
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31"
        env:
          DOCKER_BUILDKIT: "1"

      - label: "Build wheel - x86_64 - CUDA 13.0"
        depends_on: ~
        id: build-wheel-x86-cuda-13-0
        agents:
          queue: cpu_queue_postmerge
        commands:
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
        env:
          DOCKER_BUILDKIT: "1"

      - label: "Build wheel - x86_64 - CPU"
        depends_on: ~
        id: build-wheel-x86-cpu
        agents:
          queue: cpu_queue_postmerge
        commands:
          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
        env:
          DOCKER_BUILDKIT: "1"

- group: "Build release Docker images"
|
||||
key: "build-release-images"
|
||||
steps:
|
||||
- label: "Build release image - x86_64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
- label: "Build release image - x86_64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-x86-cuda-13-0
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
# re-tag to default image tag and push, just in case arm64 build fails
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Build release image - aarch64 - CUDA 13.0"
|
||||
depends_on: ~
|
||||
id: build-release-image-arm64-cuda-13-0
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
|
||||
|
||||
- block: "Build release image for x86_64 CPU"
|
||||
key: block-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - CPU"
|
||||
depends_on:
|
||||
- block-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for arm64 CPU"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - arm64 - CPU"
|
||||
depends_on:
|
||||
- block-arm64-cpu-release-image-build
|
||||
- input-release-version
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build release image for x86_64 ROCm"
|
||||
key: block-rocm-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build release image - x86_64 - ROCm"
|
||||
depends_on: block-rocm-release-image-build
|
||||
id: build-release-image-rocm
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
# Build base image first
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
|
||||
# Build vLLM ROCm image using the base
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
|
||||
|
||||
- group: "Publish release images"
|
||||
key: "publish-release-images"
|
||||
steps:
|
||||
- label: "Create multi-arch manifest - CUDA 12.9"
|
||||
depends_on:
|
||||
- build-release-image-x86
|
||||
- build-release-image-arm64
|
||||
id: create-multi-arch-manifest
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Annotate release workflow - CUDA 12.9"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-release.sh"
|
||||
|
||||
- label: "Create multi-arch manifest - CUDA 13.0"
|
||||
depends_on:
|
||||
- build-release-image-x86-cuda-13-0
|
||||
- build-release-image-arm64-cuda-13-0
|
||||
id: create-multi-arch-manifest-cuda-13-0
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
|
||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- label: "Publish nightly multi-arch image to DockerHub - CUDA 13.0"
|
||||
depends_on:
|
||||
- create-multi-arch-manifest-cuda-13-0
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh cu130-nightly-"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
- group: "Publish wheels"
|
||||
key: "publish-wheels"
|
||||
steps:
|
||||
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
|
||||
key: block-upload-release-wheels
|
||||
depends_on:
|
||||
- input-release-version
|
||||
- build-wheels
|
||||
|
||||
- label: "Upload release wheels to PyPI"
|
||||
depends_on:
|
||||
- block-upload-release-wheels
|
||||
id: upload-release-wheels
|
||||
agents:
|
||||
queue: small_cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
# =============================================================================
|
||||
#
|
||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
||||
# To build a specific version, trigger the build from that branch/tag.
|
||||
#
|
||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
||||
- input: "ROCm Wheel Release Build Configuration"
|
||||
key: input-rocm-config
|
||||
depends_on: ~
|
||||
if: build.source == "ui"
|
||||
fields:
|
||||
- text: "Python Version"
|
||||
key: "rocm-python-version"
|
||||
default: "3.12"
|
||||
hint: "Python version (e.g., 3.12)"
|
||||
- text: "GPU Architectures"
|
||||
key: "rocm-pytorch-rocm-arch"
|
||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
||||
hint: "Semicolon-separated GPU architectures"
|
||||
- select: "Upload Wheels to S3"
|
||||
key: "rocm-upload-wheels"
|
||||
default: "true"
|
||||
options:
|
||||
- label: "No - Build only (nightly/dev)"
|
||||
value: "false"
|
||||
- label: "Yes - Upload to S3 (release)"
|
||||
value: "true"
|
||||
- select: "Force Rebuild Base Wheels"
|
||||
key: "rocm-force-rebuild"
|
||||
default: "false"
|
||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
||||
options:
|
||||
- label: "No - Use cached wheels if available"
|
||||
value: "false"
|
||||
- label: "Yes - Rebuild even if cache exists"
|
||||
value: "true"
|
||||
|
||||
- label: "Build and publish CPU release image"
|
||||
depends_on: block-cpu-release-image-build
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build arm64 CPU release image"
|
||||
key: block-arm64-cpu-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish arm64 CPU release image"
|
||||
depends_on: block-arm64-cpu-release-image-build
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
if: build.env("NIGHTLY") == "1"
|
||||
- step: input-rocm-config
|
||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
||||
- "docker manifest push vllm/vllm-openai:nightly"
|
||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
# Set configuration and check cache
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Get values from meta-data (set by input step) or use defaults
|
||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
||||
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Check for force rebuild flag
|
||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Wheels Build Configuration"
|
||||
echo "========================================"
|
||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
||||
echo "========================================"
|
||||
|
||||
# Save resolved config for later jobs
|
||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
||||
|
||||
# Check S3 cache for pre-built wheels
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
||||
echo ""
|
||||
echo "Cache key: $${CACHE_KEY}"
|
||||
echo "Cache path: $${CACHE_PATH}"
|
||||
|
||||
# Save cache key for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
||||
|
||||
CACHE_STATUS="miss"
|
||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
else
|
||||
echo "Force rebuild requested, skipping cache check"
|
||||
fi
|
||||
|
||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
||||
echo ""
|
||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
||||
echo ""
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we used cache (for Docker image handling)
|
||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
||||
|
||||
echo ""
|
||||
echo "Cache download complete. Skipping Docker build."
|
||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
else
|
||||
echo ""
|
||||
echo "CACHE MISS. Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image (for later vLLM build)
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Build debs_wheel_release stage for wheel extraction
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract wheels from Docker image
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${container_id}
|
||||
echo "Extracted base wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
# Upload wheels to S3 cache for future builds
|
||||
echo ""
|
||||
echo "Uploading wheels to S3 cache..."
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Export base Docker image for reuse in vLLM build
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
echo "Docker image size:"
|
||||
ls -lh artifacts/rocm-docker-image/
|
||||
|
||||
# Upload large Docker image to S3 (also cached by cache key)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Save the S3 path for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we did NOT use cache
|
||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
||||
|
||||
echo ""
|
||||
echo "Build complete. Wheels cached for future builds."
|
||||
fi
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 180
|
||||
commands:
|
||||
# Download artifacts and prepare Docker image
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
||||
# This fixes version detection when tags are moved/force-pushed
|
||||
echo "Fetching latest tags from origin..."
|
||||
git fetch --tags --force origin
|
||||
|
||||
# Log tag information for debugging version detection
|
||||
echo "========================================"
|
||||
echo "Git Tag Verification"
|
||||
echo "========================================"
|
||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
||||
echo ""
|
||||
echo "Recent tags (pointing to commits near HEAD):"
|
||||
git tag -l --sort=-creatordate | head -5
|
||||
echo "setuptools_scm version detection:"
|
||||
pip install -q setuptools_scm 2>/dev/null || true
|
||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
||||
echo "========================================"
|
||||
|
||||
# Download wheel artifacts from current build
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image and capture the tag
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
echo "$${LOAD_OUTPUT}"
|
||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: Failed to extract image tag from docker load output"
|
||||
echo "Load output was: $${LOAD_OUTPUT}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
echo "Final vLLM wheel:"
|
||||
ls -lh artifacts/rocm-vllm-wheel/
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 3: Upload Wheels to S3
|
||||
- label: ":s3: Upload ROCm Wheels to S3"
|
||||
id: upload-rocm-wheels
|
||||
depends_on:
|
||||
- step: build-rocm-vllm-wheel
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
# Download all wheel artifacts and run upload
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
||||
# Try to get from meta-data (input form)
|
||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "Upload check:"
|
||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo "========================================"
|
||||
|
||||
# Skip upload if not enabled
|
||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Upload enabled, proceeding..."
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
||||
|
||||
# Run upload script
|
||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
- step: input-release-version
|
||||
allow_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 5: Generate Root Index for ROCm Wheels (for release only)
|
||||
# This is the job to create https://wheels.vllm.ai/rocm/ index allowing
|
||||
# users to install with `uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/`
|
||||
- block: "Generate Root Index for ROCm Wheels for Release"
|
||||
key: block-generate-root-index-rocm-wheels
|
||||
depends_on: upload-rocm-wheels
|
||||
|
||||
- label: ":package: Generate Root Index for ROCm Wheels for Release"
|
||||
depends_on: block-generate-root-index-rocm-wheels
|
||||
id: generate-root-index-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
VARIANT: "rocm700"
|
||||
|
||||
# ROCm Job 5: Build ROCm Release Docker Image
|
||||
- label: ":rocm: :docker: Build ROCm Release Docker Image"
|
||||
id: build-rocm-release-image
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Tag and push the base image to ECR
|
||||
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Build vLLM ROCm release image using cached base
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
|
||||
--target vllm-openai \
|
||||
--progress plain \
|
||||
-f docker/Dockerfile.rocm .
|
||||
|
||||
# Push to ECR
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
@@ -11,27 +11,32 @@ fi
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel (by commit):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
(Optional) For CUDA 13.0:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
|
||||
(Optional) For CPU:
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download the wheel (by version):
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
Download images:
|
||||
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
Tag and push images:
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
@@ -39,16 +44,47 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
docker push vllm/vllm-openai:latest-x86_64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker push vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base

Create multi-arch manifest:
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
|
||||
docker manifest rm vllm/vllm-openai:latest-cu130
|
||||
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
|
||||
docker manifest push vllm/vllm-openai:latest-cu130
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
|
||||
\`\`\`
|
||||
EOF
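After tagging and pushing, an optional sanity check (not part of the annotation above) is to confirm the pushed manifest actually lists both architectures:

```bash
# Each manifest entry carries a platform.architecture field (amd64 / arm64).
docker manifest inspect vllm/vllm-openai:latest | grep -E '"architecture": "(amd64|arm64)"'
```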
|
||||
|
||||
.buildkite/scripts/annotate-rocm-release.sh (new executable file, 112 lines)
@@ -0,0 +1,112 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
|
||||
# TODO: Enable the nightly build for ROCm
|
||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"
|
||||
|
||||
# Format ROCm version for path (e.g., "7.1" -> "rocm710")
|
||||
ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## ROCm Wheel and Docker Image Releases
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
| **ROCm Version** | ${ROCM_VERSION} |
|
||||
| **Python Version** | ${PYTHON_VERSION} |
|
||||
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
|
||||
| **Branch** | \`${BUILDKITE_BRANCH}\` |
|
||||
| **Commit** | \`${BUILDKITE_COMMIT}\` |
|
||||
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
|
||||
# Example for ROCm ${ROCM_VERSION}:
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
|
||||
\`\`\`bash
|
||||
pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ --trusted-host ${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/
|
||||
# Download specific wheels (note: 'aws s3 cp' does not expand wildcards,
# so use --include filters instead of literal globs)
aws s3 cp --recursive --exclude "*" \
  --include "vllm-*.whl" --include "torch-*.whl" --include "triton-*.whl" \
  --include "triton-kernels-*.whl" --include "torchvision-*.whl" --include "torchaudio-*.whl" \
  --include "amdsmi-*.whl" --include "aiter-*.whl" --include "flash-attn-*.whl" \
  s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/ .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton**: Triton
|
||||
- **triton-kernels**: Triton kernels
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **torchaudio**: Torchaudio for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
- **aiter**: Aiter for ROCm
|
||||
- **flash-attn**: Flash Attention for ROCm
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
|
||||
### :package: Docker Image Release
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
|
||||
EOF
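For reference, a small sketch of the ROCm version extraction used at the top of this script, fed with the BASE_IMAGE format mentioned in the comment (the input line below is illustrative):

```bash
# The greedy '.*:' anchors on the image tag separator, so only the tag's
# major.minor version is captured.
echo 'ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete' \
  | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/'
# -> 7.0
```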
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh (new executable file, 140 lines)
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Cache helper for ROCm base wheels
|
||||
#
|
||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
||||
#
|
||||
# Usage:
|
||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
||||
# cache-rocm-base-wheels.sh key - Output the cache key
|
||||
#
|
||||
# Environment variables:
|
||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
||||
# PYTHON_VERSION - Python version (affects cache key)
|
||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
||||
CACHE_PREFIX="rocm/cache"
|
||||
|
||||
# Generate hash from Dockerfile content + build args
|
||||
generate_cache_key() {
|
||||
# Include Dockerfile content
|
||||
if [[ ! -f "$DOCKERFILE" ]]; then
|
||||
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
# Include key build args that affect the output
|
||||
# These should match the ARGs in Dockerfile.rocm_base that change the build output
|
||||
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
|
||||
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
|
||||
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
|
||||
|
||||
echo "${dockerfile_hash}-${args_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
|
||||
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
|
||||
|
||||
case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
echo "Variables used in cache key:" >&2
|
||||
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
|
||||
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
|
||||
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
|
||||
echo "S3 ls output:" >&2
|
||||
echo "$S3_OUTPUT" | head -5 >&2
|
||||
|
||||
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
|
||||
echo "hit"
|
||||
else
|
||||
echo "miss"
|
||||
fi
|
||||
;;
|
||||
|
||||
upload)
|
||||
echo "========================================"
|
||||
echo "Uploading wheels to cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
|
||||
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# '|| true' so an empty directory yields 0 instead of tripping 'set -euo pipefail'
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l || true)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Uploading $WHEEL_COUNT wheels..."
|
||||
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
|
||||
|
||||
echo ""
|
||||
echo "Cache upload complete!"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
download)
|
||||
echo "========================================"
|
||||
echo "Downloading wheels from cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
key)
|
||||
echo "${CACHE_KEY}"
|
||||
;;
|
||||
|
||||
path)
|
||||
echo "${CACHE_PATH}"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 {check|upload|download|key|path}" >&2
|
||||
echo "" >&2
|
||||
echo "Commands:" >&2
|
||||
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
|
||||
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
|
||||
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
|
||||
echo " key - Output the cache key" >&2
|
||||
echo " path - Output the full S3 cache path" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
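A minimal sketch of how a pipeline step might consume this helper, assuming the command names and artifact layout documented in the usage text above:

```bash
# Restore the base wheels from cache when available; otherwise build and cache them.
if [[ "$(.buildkite/scripts/cache-rocm-base-wheels.sh check)" == "hit" ]]; then
  .buildkite/scripts/cache-rocm-base-wheels.sh download
else
  # ... build the base wheels into artifacts/rocm-base-wheels/ ...
  .buildkite/scripts/cache-rocm-base-wheels.sh upload
fi
```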
|
||||
.buildkite/scripts/cherry-pick-from-milestone.sh (new executable file, 242 lines)
@@ -0,0 +1,242 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# cherry-pick-from-milestone.sh
|
||||
# Find commits from a GitHub milestone that are missing from the current branch
|
||||
# and output them in chronological order for cherry-picking.
|
||||
#
|
||||
# Usage: ./cherry-pick-from-milestone.sh <milestone> [--dry-run] [--execute]
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") <milestone> [options]
|
||||
|
||||
Find commits from a GitHub milestone that need to be cherry-picked into the current branch.
|
||||
|
||||
Arguments:
|
||||
milestone The GitHub milestone name (e.g., v0.14.0)
|
||||
|
||||
Options:
|
||||
--dry-run Show the cherry-pick commands without executing (default)
|
||||
--execute Actually execute the cherry-picks
|
||||
--main-branch Specify the main branch name (default: main)
|
||||
--help Show this help message
|
||||
|
||||
Examples:
|
||||
$(basename "$0") v0.14.0
|
||||
$(basename "$0") v0.14.0 --dry-run
|
||||
$(basename "$0") v0.14.0 --execute
|
||||
$(basename "$0") v0.14.0 --main-branch master
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Default values
|
||||
MILESTONE=""
|
||||
DRY_RUN=true
|
||||
MAIN_BRANCH="main"
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--main-branch)
|
||||
MAIN_BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
MILESTONE="$1"
|
||||
else
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Validate milestone argument
|
||||
if [[ -z "$MILESTONE" ]]; then
|
||||
log_error "Milestone is required"
|
||||
usage
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if gh CLI is available
|
||||
if ! command -v gh &>/dev/null; then
|
||||
log_error "GitHub CLI (gh) is not installed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if authenticated with gh
|
||||
if ! gh auth status &>/dev/null; then
|
||||
log_error "Not authenticated with GitHub CLI. Run 'gh auth login' first."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_BRANCH=$(git branch --show-current)
|
||||
log_info "Current branch: ${CURRENT_BRANCH}"
|
||||
log_info "Main branch: ${MAIN_BRANCH}"
|
||||
log_info "Milestone: ${MILESTONE}"
|
||||
echo ""
|
||||
|
||||
# Fetch latest from remote
|
||||
log_info "Fetching latest from remote..."
|
||||
git fetch origin "$MAIN_BRANCH" --quiet
|
||||
|
||||
# Get merged PRs from the milestone, sorted by merge date
|
||||
log_info "Fetching merged PRs from milestone '${MILESTONE}'..."
|
||||
|
||||
# Store PR data in a temp file
|
||||
PR_DATA=$(mktemp)
|
||||
trap "rm -f $PR_DATA" EXIT
|
||||
|
||||
if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
|
||||
--limit 1000 \
|
||||
--json number,title,mergeCommit,mergedAt \
|
||||
--jq 'sort_by(.mergedAt) | .[] | "\(.mergeCommit.oid)\t\(.number)\t\(.title)"' > "$PR_DATA" 2>/dev/null; then
|
||||
log_error "Failed to fetch PRs from milestone '${MILESTONE}'"
|
||||
log_error "This could be due to:"
|
||||
log_error " - Milestone does not exist"
|
||||
log_error " - Network/authentication issues"
|
||||
log_error " - Invalid milestone name format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -s "$PR_DATA" ]]; then
|
||||
log_warn "No merged PRs found for milestone '${MILESTONE}'"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL_PRS=$(wc -l < "$PR_DATA")
|
||||
log_info "Found ${TOTAL_PRS} merged PR(s) in milestone"
|
||||
echo ""
|
||||
|
||||
# Find commits that are missing from current branch
|
||||
MISSING_COMMITS=()
|
||||
MISSING_INFO=()
|
||||
|
||||
while IFS=$'\t' read -r sha pr_number title; do
|
||||
# Skip if SHA is empty or null
|
||||
if [[ -z "$sha" || "$sha" == "null" ]]; then
|
||||
log_warn "PR #${pr_number} has no merge commit SHA, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check if this commit is already in the current branch
|
||||
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
|
||||
log_success "PR #${pr_number} already in branch: ${title:0:60}"
|
||||
else
|
||||
log_warn "PR #${pr_number} MISSING: ${title:0:60}"
|
||||
MISSING_COMMITS+=("$sha")
|
||||
MISSING_INFO+=("$sha PR #${pr_number}: ${title}")
|
||||
fi
|
||||
done < "$PR_DATA"
|
||||
|
||||
echo ""
|
||||
|
||||
if [[ ${#MISSING_COMMITS[@]} -eq 0 ]]; then
|
||||
log_success "All PRs from milestone '${MILESTONE}' are already in the current branch!"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Found ${#MISSING_COMMITS[@]} missing commit(s) to cherry-pick"
|
||||
echo ""
|
||||
|
||||
# Output the cherry-pick commands
|
||||
echo "=========================================="
|
||||
echo "Cherry-pick commands (in chronological order):"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
for info in "${MISSING_INFO[@]}"; do
|
||||
echo "# $info"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "# Run these commands to cherry-pick all missing commits:"
|
||||
echo "git cherry-pick ${MISSING_COMMITS[*]}"
|
||||
echo ""
|
||||
|
||||
# Or one by one
|
||||
echo "# Or cherry-pick one at a time:"
|
||||
for sha in "${MISSING_COMMITS[@]}"; do
|
||||
echo "git cherry-pick $sha"
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Execute if requested
|
||||
if [[ "$DRY_RUN" == false ]]; then
|
||||
echo "=========================================="
|
||||
log_info "Executing cherry-picks..."
|
||||
echo "=========================================="
|
||||
|
||||
for i in "${!MISSING_COMMITS[@]}"; do
|
||||
sha="${MISSING_COMMITS[$i]}"
|
||||
info="${MISSING_INFO[$i]}"
|
||||
|
||||
echo ""
|
||||
log_info "Cherry-picking: $info"
|
||||
|
||||
if git cherry-pick "$sha"; then
|
||||
log_success "Successfully cherry-picked $sha"
|
||||
else
|
||||
log_error "Failed to cherry-pick $sha"
|
||||
log_error "Resolve conflicts and run 'git cherry-pick --continue', or 'git cherry-pick --abort' to cancel"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
log_success "All cherry-picks completed successfully!"
|
||||
else
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}Dry run mode - no changes made${NC}"
|
||||
echo "Run with --execute to perform the cherry-picks"
|
||||
echo "=========================================="
|
||||
fi
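The missing-commit detection above hinges on `git merge-base --is-ancestor`; a standalone sketch of the same check (the SHA is hypothetical):

```bash
sha=abc1234  # hypothetical merge commit SHA
# Exit status 0 means the commit is already reachable from HEAD.
if git merge-base --is-ancestor "$sha" HEAD 2>/dev/null; then
  echo "already in branch"
else
  echo "needs cherry-pick"
fi
```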
|
||||
@@ -3,7 +3,14 @@
|
||||
set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
|
||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
|
||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
@@ -45,7 +52,7 @@ get_all_tags() {
|
||||
set -x
|
||||
|
||||
# Get both last_updated timestamp and tag name, separated by |
|
||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||
local tags=$(echo "$response" | jq -r --arg prefix "$TAG_PREFIX" '.results[] | select(.name | startswith($prefix)) | "\(.last_updated)|\(.name)"')
|
||||
|
||||
if [ -z "$tags" ]; then
|
||||
break
|
||||
|
||||
@@ -16,6 +16,18 @@ from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""
|
||||
Normalize package name according to PEP 503.
|
||||
https://peps.python.org/pep-0503/#normalized-names
|
||||
|
||||
Replace runs of underscores, hyphens, and periods with a single hyphen,
|
||||
and lowercase the result.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower()
|
||||
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
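As a quick illustration of the PEP 503 rule introduced above (a shell sketch, not part of the diff; the package name is made up):

```bash
# Runs of '-', '_' and '.' collapse to a single '-', and the result is lowercased.
echo 'Triton_Kernels..Extra' | sed -E 's/[-_.]+/-/g' | tr '[:upper:]' '[:lower:]'
# -> triton-kernels-extra
```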
|
||||
|
||||
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version, variant = version.split("+")
|
||||
version_part, suffix = version.split("+", 1)
|
||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
||||
# Git hashes and other suffixes are NOT variants
|
||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
||||
variant = suffix
|
||||
version = version_part
|
||||
# Otherwise keep the full version string (variant stays None)
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# For ROCm builds: inherit variant from vllm wheel
|
||||
# All ROCm wheels should share the same variant as vllm
|
||||
rocm_variant = None
|
||||
for file in parsed_files:
|
||||
if (
|
||||
file.package_name == "vllm"
|
||||
and file.variant
|
||||
and file.variant.startswith("rocm")
|
||||
):
|
||||
rocm_variant = file.variant
|
||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
||||
break
|
||||
|
||||
# Apply ROCm variant to all wheels without a variant
|
||||
if rocm_variant:
|
||||
for file in parsed_files:
|
||||
if file.variant is None:
|
||||
file.variant = rocm_variant
|
||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
# gather all package names in this variant (normalized per PEP 503)
|
||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
# filter files belonging to this package only (compare normalized names)
|
||||
package_files = [
|
||||
f for f in files if normalize_package_name(f.package_name) == package
|
||||
]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
@@ -341,8 +381,13 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
||||
if "\\" in version:
|
||||
raise ValueError("Version string must not contain backslashes.")
|
||||
if "/" in version and not version.startswith("rocm/"):
|
||||
raise ValueError(
|
||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
||||
)
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
@@ -393,8 +438,23 @@ if __name__ == "__main__":
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
wheel_dir = args.wheel_dir or version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
|
||||
#
|
||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
||||
# - rocm/{commit}/ (same as wheels)
|
||||
# - rocm/nightly/
|
||||
# - rocm/{version}/
|
||||
# All these are under the "rocm/" prefix, so relative paths should be
|
||||
# relative to "rocm/", not the bucket root.
|
||||
if args.wheel_dir:
|
||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
||||
elif version.startswith("rocm/"):
|
||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
||||
wheel_dir = version.split("/", 1)[1]
|
||||
else:
|
||||
wheel_dir = version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
|
||||
.buildkite/scripts/push-nightly-builds.sh (new executable file, 36 lines)
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
# Get the tag variant from the first argument (e.g. "cu130"); default to empty if not provided.
|
||||
# Due to limits in the cleanup script, we must move variants to separate tags like "cu130-nightly",
|
||||
# otherwise they will be cleaned up together with the main "nightly" tags.
|
||||
|
||||
TAG_VARIANT="$1"
|
||||
if [ -n "$TAG_VARIANT" ]; then
|
||||
ORIG_TAG_SUFFIX="-$TAG_VARIANT"
|
||||
TAG_NAME="$TAG_VARIANT-nightly"
|
||||
else
|
||||
ORIG_TAG_SUFFIX=""
|
||||
TAG_NAME="nightly"
|
||||
fi
|
||||
|
||||
ORIG_TAG_NAME="$BUILDKITE_COMMIT"
|
||||
|
||||
echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag name: $TAG_NAME"
|
||||
|
||||
# pull original arch-dependent images from AWS ECR Public
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX
|
||||
# tag arch-dependent images
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-dependent images to DockerHub
|
||||
docker push vllm/vllm-openai:$TAG_NAME-x86_64
|
||||
docker push vllm/vllm-openai:$TAG_NAME-aarch64
|
||||
# push arch-independent manifest to DockerHub
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME
|
||||
docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT
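For reference, the two invocation forms this script supports, following the TAG_VARIANT handling above:

```bash
# No argument: promote the per-commit images to the plain "nightly" tags.
.buildkite/scripts/push-nightly-builds.sh
# With a variant: promote the "-cu130" images to "cu130-nightly" tags instead.
.buildkite/scripts/push-nightly-builds.sh cu130
```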
|
||||
@@ -18,15 +18,18 @@ wait_for_server() {
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
# Set BACKENDS and platform-specific args based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
PLATFORM_ARGS=("--no-async-scheduling")
|
||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
PLATFORM_ARGS=()
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
@@ -54,6 +57,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
"${PLATFORM_ARGS[@]}" \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
.buildkite/scripts/trigger-ci-build.sh (new executable file, 227 lines)
@@ -0,0 +1,227 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# trigger-ci-build.sh
|
||||
# Trigger a Buildkite CI build using the bk CLI for the current commit and branch
|
||||
# with RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
#
|
||||
# Usage: ./trigger-ci-build.sh [options]
|
||||
#
|
||||
# Requires: bk CLI (https://buildkite.com/docs/platform/cli)
|
||||
#
|
||||
# SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
#
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default configuration
|
||||
PIPELINE="ci"
|
||||
DRY_RUN=true
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $(basename "$0") [options]
|
||||
|
||||
Trigger a Buildkite CI build using the bk CLI for the current commit and branch.
|
||||
Sets RUN_ALL=1 and NIGHTLY=1 environment variables.
|
||||
|
||||
SAFETY: Dry-run by default. Use --execute to actually trigger a build.
|
||||
|
||||
Options:
|
||||
--execute Actually trigger the build (default: dry-run)
|
||||
--pipeline Buildkite pipeline slug (default: ${PIPELINE})
|
||||
--commit Override commit SHA (default: current HEAD)
|
||||
--branch Override branch name (default: current branch)
|
||||
--message Custom build message (default: auto-generated)
|
||||
--help Show this help message
|
||||
|
||||
Prerequisites:
|
||||
- bk CLI installed: brew tap buildkite/buildkite && brew install buildkite/buildkite/bk
|
||||
- bk configured: bk configure
|
||||
|
||||
Examples:
|
||||
$(basename "$0") # Dry-run, show what would happen
|
||||
$(basename "$0") --execute # Actually trigger the build
|
||||
$(basename "$0") --pipeline ci-shadow # Dry-run with different pipeline
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[OK]${NC} $1"
|
||||
}
|
||||
|
||||
log_warn() {
|
||||
echo -e "${YELLOW}[WARN]${NC} $1"
|
||||
}
|
||||
|
||||
log_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1" >&2
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
COMMIT=""
|
||||
BRANCH=""
|
||||
MESSAGE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--execute)
|
||||
DRY_RUN=false
|
||||
shift
|
||||
;;
|
||||
--pipeline)
|
||||
PIPELINE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--commit)
|
||||
COMMIT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--branch)
|
||||
BRANCH="$2"
|
||||
shift 2
|
||||
;;
|
||||
--message)
|
||||
MESSAGE="$2"
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
-*)
|
||||
log_error "Unknown option: $1"
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
log_error "Unexpected argument: $1"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Check if bk CLI is installed
|
||||
if ! command -v bk &>/dev/null; then
|
||||
log_error "Buildkite CLI (bk) is not installed"
|
||||
echo ""
|
||||
echo "Install with:"
|
||||
echo " brew tap buildkite/buildkite && brew install buildkite/buildkite/bk"
|
||||
echo ""
|
||||
echo "Then configure:"
|
||||
echo " bk configure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check if we're in a git repository
|
||||
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
|
||||
log_error "Not in a git repository"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get current commit and branch if not overridden
|
||||
if [[ -z "$COMMIT" ]]; then
|
||||
COMMIT=$(git rev-parse HEAD)
|
||||
fi
|
||||
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
BRANCH=$(git branch --show-current)
|
||||
if [[ -z "$BRANCH" ]]; then
|
||||
# Detached HEAD state - try to get branch from ref
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
fi
|
||||
fi
|
||||
|
||||
# Generate default message if not provided
|
||||
if [[ -z "$MESSAGE" ]]; then
|
||||
COMMIT_MSG=$(git log -1 --pretty=format:"%s" "$COMMIT" 2>/dev/null || echo "Manual build")
|
||||
MESSAGE="[Manual] ${COMMIT_MSG}"
|
||||
fi
|
||||
|
||||
# Safety check: Verify the commit exists on the remote
|
||||
log_info "Verifying commit exists on remote..."
|
||||
git fetch origin --quiet 2>/dev/null || true
|
||||
|
||||
# Check if commit is reachable from any remote branch
|
||||
REMOTE_BRANCHES=$(git branch -r --contains "$COMMIT" 2>/dev/null || true)
|
||||
if [[ -z "$REMOTE_BRANCHES" ]]; then
|
||||
log_error "Commit ${COMMIT} does not exist on any remote branch!"
|
||||
echo ""
|
||||
echo "The CI system will fail to checkout this commit."
|
||||
echo "Please push your changes first:"
|
||||
echo ""
|
||||
echo " git push origin ${BRANCH}"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Commit found on remote branches:"
|
||||
echo "$REMOTE_BRANCHES" | head -5 | sed 's/^/ /'
|
||||
if [[ $(echo "$REMOTE_BRANCHES" | wc -l) -gt 5 ]]; then
|
||||
echo " ... and more"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
log_info "Pipeline: ${PIPELINE}"
|
||||
log_info "Branch: ${BRANCH}"
|
||||
log_info "Commit: ${COMMIT}"
|
||||
log_info "Message: ${MESSAGE}"
|
||||
log_info "Environment: RUN_ALL=1, NIGHTLY=1"
|
||||
echo ""
|
||||
|
||||
# Build the command
|
||||
CMD=(bk build create
|
||||
-y
|
||||
-w
|
||||
-i
|
||||
--pipeline "${PIPELINE}"
|
||||
--commit "${COMMIT}"
|
||||
--branch "${BRANCH}"
|
||||
--message "${MESSAGE}"
|
||||
--env "RUN_ALL=1"
|
||||
--env "NIGHTLY=1"
|
||||
)
|
||||
|
||||
if [[ "$DRY_RUN" == true ]]; then
|
||||
echo "=========================================="
|
||||
log_warn "DRY-RUN MODE - No build will be triggered"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Command that would be executed:"
|
||||
echo ""
|
||||
# Escape single quotes in values for safe shell display
|
||||
escape_for_shell() {
|
||||
printf '%s' "$1" | sed "s/'/'\\\\''/g"
|
||||
}
|
||||
echo " bk build create \\"
|
||||
echo " -y \\"
|
||||
echo " -w \\"
|
||||
echo " -i \\"
|
||||
echo " --pipeline '$(escape_for_shell "${PIPELINE}")' \\"
|
||||
echo " --commit '$(escape_for_shell "${COMMIT}")' \\"
|
||||
echo " --branch '$(escape_for_shell "${BRANCH}")' \\"
|
||||
echo " --message '$(escape_for_shell "${MESSAGE}")' \\"
|
||||
echo " --env 'RUN_ALL=1' \\"
|
||||
echo " --env 'NIGHTLY=1'"
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo -e "${YELLOW}To actually trigger this build, run:${NC}"
|
||||
echo ""
|
||||
echo " $0 --execute"
|
||||
echo "=========================================="
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log_info "Triggering build..."
|
||||
|
||||
# Execute the command - bk will print the URL and open browser
|
||||
"${CMD[@]}"
|
||||
.buildkite/scripts/upload-release-wheels-pypi.sh (new file, 70 lines)
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
BUCKET="vllm-wheels"
|
||||
SUBPATH=$BUILDKITE_COMMIT
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
|
||||
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
|
||||
# '|| true' keeps 'set -e' from exiting here so the explicit check below can report the error
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null || true)
|
||||
|
||||
echo "Release version from Buildkite: $RELEASE_VERSION"
|
||||
|
||||
if [[ -z "$GIT_VERSION" ]]; then
|
||||
echo "[FATAL] Not on a git tag, cannot create release."
|
||||
exit 1
|
||||
else
|
||||
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
|
||||
fi
|
||||
# sanity check for version mismatch
|
||||
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
|
||||
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
|
||||
echo "[WARNING] Force release and ignore version mismatch"
|
||||
else
|
||||
echo "[FATAL] Release version from Buildkite does not match Git version."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
|
||||
|
||||
# check pypi token
|
||||
if [[ -z "$PYPI_TOKEN" ]]; then
|
||||
echo "[FATAL] PYPI_TOKEN is not set."
|
||||
exit 1
|
||||
else
|
||||
export TWINE_USERNAME="__token__"
|
||||
export TWINE_PASSWORD="$PYPI_TOKEN"
|
||||
fi
|
||||
|
||||
set -x # enable tracing only now, after the token handling above, so secrets are not echoed
|
||||
|
||||
# install twine from pypi
|
||||
python3 -m venv /tmp/vllm-release-env
|
||||
source /tmp/vllm-release-env/bin/activate
|
||||
pip install twine
|
||||
python3 -m twine --version
|
||||
|
||||
# copy release wheels to local directory
|
||||
DIST_DIR=/tmp/vllm-release-dist
|
||||
echo "Existing wheels on S3:"
|
||||
aws s3 ls "$S3_COMMIT_PREFIX"
|
||||
echo "Copying wheels to local directory"
|
||||
mkdir -p $DIST_DIR
|
||||
# include only wheels for the release version; skip files with "dev" or "rc<digit>" in the name (requiring a digit after "rc" avoids excluding 'aarch64')
|
||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||
echo "Wheels copied to local directory"
|
||||
# generate source tarball
|
||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
|
||||
ls -la $DIST_DIR
|
||||
|
||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||
echo "No default variant wheels found, quitting..."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -m twine check $PYPI_WHEEL_FILES
|
||||
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
|
||||
echo "Wheels uploaded to PyPI"
|
||||
.buildkite/scripts/upload-rocm-wheels.sh (new executable file, 151 lines)
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Upload ROCm wheels to S3 with proper index generation
|
||||
#
|
||||
# Required environment variables:
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
||||
# S3_BUCKET (default: vllm-wheels)
|
||||
#
|
||||
# S3 path structure:
|
||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== Configuration ========
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
INDICES_OUTPUT_DIR="rocm-indices"
|
||||
PYTHON="${PYTHON_PROG:-python3}"
|
||||
|
||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
||||
MANYLINUX_VERSION="manylinux_2_35"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Configuration"
|
||||
echo "========================================"
|
||||
echo "S3 Bucket: $BUCKET"
|
||||
echo "S3 Path: $ROCM_SUBPATH"
|
||||
echo "Commit: $BUILDKITE_COMMIT"
|
||||
echo "Branch: $BUILDKITE_BRANCH"
|
||||
echo "========================================"
|
||||
|
||||
# ======== Part 0: Setup Python ========
|
||||
|
||||
# Detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# Use new python from docker
|
||||
# Use --user to ensure files are created with correct ownership (not root)
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== Part 1: Collect and prepare wheels ========
|
||||
|
||||
# Collect all wheels
|
||||
mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
echo "ERROR: No wheels found to upload!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Rename linux to manylinux in wheel filenames
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Wheels to upload:"
|
||||
ls -lh all-rocm-wheels/
|
||||
|
||||
# ======== Part 2: Upload wheels to S3 ========
|
||||
|
||||
echo ""
|
||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
done
|
||||
|
||||
# ======== Part 3: Generate and upload indices ========
|
||||
|
||||
# List existing wheels in commit directory
|
||||
echo ""
|
||||
echo "Generating indices..."
|
||||
obj_json="rocm-objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# Use the existing generate-nightly-index.py
|
||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
||||
--version "$ROCM_SUBPATH" \
|
||||
--current-objects "$obj_json" \
|
||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
||||
|
||||
# Upload indices to commit directory
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# Update rocm/nightly/ if on main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Updating rocm/nightly/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
PURE_VERSION="${VERSION%%+*}"
|
||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
||||
echo "Pure version: $PURE_VERSION"
|
||||
|
||||
if [[ "$VERSION" != *"dev"* ]]; then
|
||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ======== Part 4: Summary ========
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Complete!"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Wheels available at:"
|
||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
echo "Install command (by commit):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Install command (nightly):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
||||
fi
|
||||
echo ""
|
||||
echo "Wheel count: $WHEEL_COUNT"
|
||||
echo "========================================"
|
||||
@@ -71,6 +71,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -82,6 +83,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -428,6 +430,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -452,10 +456,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||
@@ -703,6 +709,17 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -855,7 +872,7 @@ steps:
|
||||
|
||||
- label: Language Models Tests (Standard)
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
torch_nightly: true
|
||||
@@ -1451,7 +1468,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||
|
||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
@@ -1462,10 +1479,10 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 15
|
||||
@@ -1476,7 +1493,7 @@ steps:
|
||||
- tests/v1/kv_connector/nixl_integration/
|
||||
commands:
|
||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||
|
||||
##### multi gpus test #####
|
||||
##### A100 test #####
|
||||
@@ -1662,17 +1679,6 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@@ -64,6 +64,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -75,6 +76,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
@@ -374,6 +376,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -396,10 +400,12 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: V1 Test others (CPU) # 5 mins
|
||||
source_file_dependencies:
|
||||
@@ -624,6 +630,56 @@ steps:
|
||||
- pytest -v -s kernels/moe/test_batched_deepgemm.py
|
||||
- pytest -v -s kernels/attention/test_deepgemm_attention.py
|
||||
|
||||
- label: Kernels Helion Test
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/utils/import_utils.py
|
||||
- tests/kernels/helion/
|
||||
commands:
|
||||
- pip install helion
|
||||
- pytest -v -s kernels/helion/
|
||||
|
||||
|
||||
- label: Kernels FP8 MoE Test (1 H100)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutlass_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
|
||||
- pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
|
||||
- pytest -v -s kernels/moe/test_moe.py
|
||||
# - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
|
||||
- pytest -v -s kernels/moe/test_block_int8.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
|
||||
- pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
|
||||
|
||||
- label: Kernels FP8 MoE Test (2 H100s)
|
||||
timeout_in_minutes: 90
|
||||
gpu: h100
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
|
||||
- pytest -v -s kernels/moe/test_deepep_moe.py
|
||||
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
|
||||
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
|
||||
|
||||
- label: Kernels Fp4 MoE Test (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
num_gpus: 1
|
||||
optional: true
|
||||
commands:
|
||||
- pytest -v -s kernels/moe/test_cutedsl_moe.py
|
||||
- pytest -v -s kernels/moe/test_flashinfer_moe.py
|
||||
- pytest -v -s kernels/moe/test_nvfp4_moe.py
|
||||
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
|
||||
|
||||
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
torch_nightly: true
|
||||
@@ -951,7 +1007,7 @@ steps:
|
||||
# Whisper needs spawn method to avoid deadlock
|
||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
|
||||
- label: Blackwell Test # 21 min
|
||||
- label: Blackwell Test # 23 min
|
||||
timeout_in_minutes: 30
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@@ -991,6 +1047,8 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||
# e2e
|
||||
- pytest -v -s tests/models/quantization/test_nvfp4.py
|
||||
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@@ -1045,6 +1103,48 @@ steps:
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: Hopper Fusion E2E Tests (H100) # 10min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/test_fusion_attn.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# skip Llama-4 since it does not fit on this device
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
|
||||
|
||||
- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/fp4/
|
||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
- vllm/compilation/
|
||||
# can affect pattern matching
|
||||
- vllm/model_executor/layers/layernorm.py
|
||||
- vllm/model_executor/layers/activation.py
|
||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||
- tests/compile/distributed/test_fusions_e2e.py
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@@ -1344,22 +1444,31 @@ steps:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
gpu: h200
|
||||
- label: Sequence Parallel Tests (H100) # 60 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||
# Run sequence parallel tests
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
|
||||
- label: Distributed Tests (H100) # optional
|
||||
gpu: h100
|
||||
optional: true
|
||||
working_dir: "/vllm-workspace/"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
|
||||
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### H200 test #####
|
||||
|
||||
- label: LM Eval Large Models (H200) # optional
|
||||
timeout_in_minutes: 60
|
||||
gpu: h200
|
||||
|
||||
@@ -6,6 +6,8 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
@@ -15,7 +17,9 @@ steps:
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- vllm/config/attention.py
|
||||
- vllm/model_executor/layers/attention
|
||||
- vllm/v1/attention
|
||||
- tests/v1/attention
|
||||
commands:
|
||||
- VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
@@ -121,6 +121,7 @@ steps:
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/renderers
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/tokenizers_
|
||||
- tests/tool_parsers
|
||||
@@ -132,6 +133,7 @@ steps:
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s -m 'cpu_test' multimodal
|
||||
- pytest -v -s renderers
|
||||
- pytest -v -s tokenizers_
|
||||
- pytest -v -s tool_parsers
|
||||
- pytest -v -s transformers_utils
|
||||
|
||||
12
.github/mergify.yml
vendored
@@ -414,6 +414,18 @@ pull_request_rules:
remove:
- needs-rebase

- name: label-bug
description: Automatically apply bug label
conditions:
- label != stale
- or:
- title~=(?i)\bbug\b
- title~=(?i)\bbugfix\b
actions:
label:
add:
- bug

- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -29,8 +29,9 @@ jobs:

- name: Install dependencies and build vLLM
run: |
uv pip install -r requirements/cpu-build.txt --index-strategy unsafe-best-match
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e .
uv pip install -e . --no-build-isolation
env:
CMAKE_BUILD_PARALLEL_LEVEL: 4
6
.gitignore
vendored
@@ -7,6 +7,9 @@ vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*

# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py

# triton jit
.triton

@@ -191,6 +194,9 @@ CLAUDE.md
AGENTS.md
.codex/

# Cursor
.cursor/

# DS Store
.DS_Store
@@ -147,6 +147,13 @@ repos:
entry: python tools/pre_commit/validate_config.py
language: python
additional_dependencies: [regex]
- id: validate-docker-versions
name: Validate docker/versions.json matches Dockerfile
entry: python tools/generate_versions_json.py --check
language: python
files: ^docker/(Dockerfile|versions\.json)$
pass_filenames: false
additional_dependencies: [dockerfile-parse]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
@@ -377,7 +377,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
|
||||
set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
|
||||
@@ -412,7 +412,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_float16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
@@ -422,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/marlin/sm80_kernel_*_bfloat16.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
|
||||
@@ -434,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
|
||||
@@ -446,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
|
||||
@@ -459,10 +459,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
|
||||
"csrc/quantization/marlin/marlin.cu"
|
||||
"csrc/quantization/marlin/marlin_int4_fp8_preprocess.cu"
|
||||
"csrc/quantization/marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/marlin/awq_marlin_repack.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_SRCS}"
|
||||
CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
|
||||
|
||||
@@ -20,8 +20,12 @@ FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"vllm": dict(backend="vllm", enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", enabled=True),
|
||||
"vllm": dict(backend="vllm", is_sf_swizzled_layout=False, enabled=True),
|
||||
"vllm-swizzle": dict(backend="vllm", is_sf_swizzled_layout=True, enabled=True),
|
||||
"flashinfer": dict(backend="flashinfer", is_sf_swizzled_layout=False, enabled=True),
|
||||
"flashinfer-swizzle": dict(
|
||||
backend="flashinfer", is_sf_swizzled_layout=True, enabled=True
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
@@ -36,7 +40,7 @@ def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
|
||||
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
@@ -63,19 +67,36 @@ def benchmark(batch_size, provider, N, K):
|
||||
|
||||
if cfg["backend"] == "vllm":
|
||||
# vLLM's FP4 quantization
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(a, a_global_scale),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif cfg["backend"] == "flashinfer":
|
||||
# FlashInfer's FP4 quantization
|
||||
# Use is_sf_swizzled_layout=True to match vLLM's output format
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
if cfg["is_sf_swizzled_layout"]:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
else:
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=False
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
# Convert ms to us for better readability at small batch sizes
|
||||
to_us = lambda t_ms: t_ms * 1000
|
||||
@@ -92,7 +113,9 @@ def prepare_shapes(args):
|
||||
return out
|
||||
|
||||
|
||||
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
def _test_accuracy_once(
|
||||
M: int, K: int, dtype: torch.dtype, device: str, is_sf_swizzled_layout: bool
|
||||
):
|
||||
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
|
||||
# Create input tensor
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
@@ -101,11 +124,13 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
a_global_scale = compute_global_scale(a)
|
||||
|
||||
# vLLM quantization
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
|
||||
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
|
||||
# FlashInfer quantization (with swizzled layout to match vLLM's output)
|
||||
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
|
||||
a, a_global_scale, is_sf_swizzled_layout=True
|
||||
a, a_global_scale, is_sf_swizzled_layout=is_sf_swizzled_layout
|
||||
)
|
||||
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
|
||||
|
||||
@@ -114,7 +139,14 @@ def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
|
||||
vllm_fp4,
|
||||
flashinfer_fp4,
|
||||
)
|
||||
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
|
||||
# Compare scales
|
||||
torch.testing.assert_close(
|
||||
vllm_scale,
|
||||
flashinfer_scale,
|
||||
)
|
||||
print(
|
||||
f"M={M}, K={K}, dtype={dtype}, is_sf_swizzled_layout={is_sf_swizzled_layout}: PASSED" # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
def test_accuracy():
|
||||
@@ -130,9 +162,10 @@ def test_accuracy():
|
||||
Ms = [1, 1024]
|
||||
Ks = [4096]
|
||||
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device)
|
||||
for is_sf_swizzled_layout in [True, False]:
|
||||
for M in Ms:
|
||||
for K in Ks:
|
||||
_test_accuracy_once(M, K, dtype, device, is_sf_swizzled_layout)
|
||||
|
||||
print("\nAll accuracy tests passed!")
|
||||
|
||||
@@ -145,7 +178,7 @@ if __name__ == "__main__":
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
|
||||
@@ -7,7 +7,7 @@ import itertools
import torch

import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.custom_op import op_registry
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
@@ -33,14 +33,14 @@ def benchmark_activation(
torch.set_default_device(device)

if func_name == "gelu_and_mul":
layer = CustomOp.op_registry[func_name](approximate="none")
layer = op_registry[func_name](approximate="none")
elif func_name == "gelu_and_mul_tanh":
layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
layer = op_registry["gelu_and_mul"](approximate="tanh")
elif func_name == "fatrelu_and_mul":
threshold = 0.5
layer = CustomOp.op_registry[func_name](threshold)
layer = op_registry[func_name](threshold)
else:
layer = CustomOp.op_registry[func_name]()
layer = op_registry[func_name]()

x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
compiled_layer = torch.compile(layer.forward_native)
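For reference, a minimal sketch of the registry-based construction this hunk switches to, using only op names and arguments already present in the benchmark above (illustrative, not part of the diff):

# Look the op class up by name in the registry, then construct it with its per-op arguments.
from vllm.model_executor.custom_op import op_registry

gelu_tanh_layer = op_registry["gelu_and_mul"](approximate="tanh")
fatrelu_layer = op_registry["fatrelu_and_mul"](0.5)  # threshold, as in the benchmark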
@@ -9,6 +9,7 @@ but use different quantization strategies and backends.
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
|
||||
@@ -138,12 +139,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
e=num_experts,
|
||||
n=n,
|
||||
k=k,
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=num_experts,
|
||||
hidden_dim=k,
|
||||
intermediate_size_per_partition=n,
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
@@ -198,8 +199,7 @@ def bench_run(
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
@@ -244,8 +244,7 @@ def bench_run(
|
||||
kernel = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
|
||||
CutlassExpertsFp4(
|
||||
out_dtype=dtype,
|
||||
max_experts_per_worker=e,
|
||||
make_dummy_moe_config(),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
99
benchmarks/kernels/benchmark_fused_topk.py
Normal file
@@ -0,0 +1,99 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import itertools

import torch

from vllm.model_executor.layers.fused_moe.router.fused_topk_router import fused_topk
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

num_tokens_range = [2**i for i in range(0, 8, 2)]
num_experts_range = [16, 32, 64, 128, 256, 512]
topk_range = [3, 4]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))


def torch_topk(
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    scoring_func: str = "softmax",
):
    if scoring_func == "softmax":
        scores = torch.softmax(gating_output.float(), dim=-1)
    else:
        scores = torch.sigmoid(gating_output.float())
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights, topk_ids


def get_benchmark(scoring_func):
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["num_tokens", "num_experts", "topk"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["torch", "vllm"],
            line_names=["Torch", "vLLM"],
            styles=[("blue", "-"), ("red", "-")],
            ylabel="us",
            plot_name=f"fused-topk-perf-{scoring_func}",
            args={},
        )
    )
    def benchmark(num_tokens, num_experts, topk, provider):
        dtype = torch.bfloat16
        hidden_size = 1024
        renormalize = True
        hidden_states = torch.randn(
            (num_tokens, hidden_size), dtype=dtype, device="cuda"
        )
        gating_output = torch.randn(
            (num_tokens, num_experts), dtype=dtype, device="cuda"
        )

        quantiles = [0.5, 0.2, 0.8]

        if provider == "torch":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: torch_topk(
                    gating_output=gating_output,
                    topk=topk,
                    renormalize=renormalize,
                    scoring_func=scoring_func,
                ),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: fused_topk(
                    hidden_states=hidden_states,
                    gating_output=gating_output,
                    topk=topk,
                    renormalize=renormalize,
                    scoring_func=scoring_func,
                ),
                quantiles=quantiles,
            )

        return 1000 * ms, 1000 * max_ms, 1000 * min_ms

    return benchmark


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the MoE topk kernel.")
    parser.add_argument("--scoring-func", type=str, default="softmax")
    parser.add_argument("--save-path", type=str, default="./configs/fused_topk/")
    args = parser.parse_args()

    # Get the benchmark function
    benchmark = get_benchmark(args.scoring_func)
    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)
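A quick, self-contained way to sanity-check the reference torch_topk path from the new benchmark file above (hypothetical input values; assumes torch_topk from that file is in scope):

import torch

gating = torch.tensor([[0.1, 2.0, 0.3, 1.5]])
weights, ids = torch_topk(gating, topk=2, renormalize=True)
print(ids)                  # tensor([[1, 3]]) -- the two largest logits
print(weights.sum(dim=-1))  # ~1.0 per token because renormalize=True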
||||
@@ -6,6 +6,7 @@ import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from tests.kernels.moe.utils import make_dummy_moe_config
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||
@@ -134,13 +135,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -166,13 +167,13 @@ def bench_run(
|
||||
fn = mk.FusedMoEModularKernel(
|
||||
MoEPrepareAndFinalizeNoEP(),
|
||||
CutlassExpertsFp8(
|
||||
out_dtype=a.dtype,
|
||||
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
|
||||
e=w2.shape[0],
|
||||
n=w2.shape[2],
|
||||
k=w2.shape[1],
|
||||
moe_config=make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
),
|
||||
quant_config=quant_config,
|
||||
device=w1.device,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -231,7 +231,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
|
||||
assert bt.w_tok_s is None
|
||||
assert bt.group_size is not None
|
||||
|
||||
fn = lambda: ops.gptq_marlin_gemm(
|
||||
fn = lambda: ops.marlin_gemm(
|
||||
a=bt.a,
|
||||
c=None,
|
||||
b_q_weight=w_q,
|
||||
|
||||
@@ -239,7 +239,7 @@ def bench_run(
|
||||
"sm_version": sm_version,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"marlin_gemm": ops.marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
|
||||
@@ -263,21 +263,21 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm",
|
||||
description="marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp32",
|
||||
description="marlin_gemm_fp32",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
|
||||
@@ -15,11 +15,18 @@ import ray
|
||||
import torch
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEConfig,
|
||||
FusedMoEParallelConfig,
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
_get_config_dtype_str,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.triton_utils import triton
|
||||
@@ -194,10 +201,36 @@ def benchmark_config(
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
deep_gemm_experts = None
|
||||
if use_deep_gemm:
|
||||
deep_gemm_experts = mk.FusedMoEModularKernel(
|
||||
prepare_finalize=MoEPrepareAndFinalizeNoEP(),
|
||||
fused_experts=TritonOrDeepGemmExperts(
|
||||
moe_config=FusedMoEConfig(
|
||||
num_experts=num_experts,
|
||||
experts_per_token=topk,
|
||||
hidden_dim=hidden_size,
|
||||
intermediate_size_per_partition=shard_intermediate_size,
|
||||
num_local_experts=num_experts,
|
||||
activation="silu",
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
in_dtype=init_dtype,
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
device="cuda",
|
||||
),
|
||||
quant_config=quant_config,
|
||||
),
|
||||
)
|
||||
|
||||
with override_config(config):
|
||||
topk_weights, topk_ids, token_expert_indices = fused_topk(
|
||||
x, input_gating, topk, renormalize=not use_deep_gemm
|
||||
)
|
||||
|
||||
if use_deep_gemm:
|
||||
return deep_gemm_experts(
|
||||
x, w1, w2, topk_weights, topk_ids, inplace=True
|
||||
)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
@@ -206,7 +239,6 @@ def benchmark_config(
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
quant_config=quant_config,
|
||||
allow_deep_gemm=use_deep_gemm,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
@@ -643,6 +675,7 @@ def main(args: argparse.Namespace):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
"Glm4MoeLiteForCausalLM",
|
||||
"NemotronHForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
|
||||
@@ -8,7 +8,7 @@ import ray
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.model_executor.layers.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
|
||||
_moe_permute,
|
||||
_moe_unpermute_and_reduce,
|
||||
@@ -86,9 +86,7 @@ def benchmark_permute(
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
)
|
||||
) = _moe_permute(qhidden_states, None, topk_ids, num_experts, None, 16)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@@ -182,7 +180,7 @@ def benchmark_unpermute(
|
||||
expert_ids,
|
||||
inv_perm,
|
||||
) = _moe_permute(
|
||||
qhidden_states, None, topk_ids, num_experts, None, align_block_size
|
||||
qhidden_states, None, topk_ids, num_experts, None, block_m=16
|
||||
)
|
||||
# convert to fp16/bf16 as gemm output
|
||||
return (
|
||||
@@ -330,6 +328,7 @@ def main(args: argparse.Namespace):
|
||||
config.architectures[0] == "DeepseekV3ForCausalLM"
|
||||
or config.architectures[0] == "DeepseekV2ForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeForCausalLM"
|
||||
or config.architectures[0] == "Glm4MoeLiteForCausalLM"
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
@@ -14,7 +14,6 @@ from vllm.triton_utils import triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
calc_diff,
|
||||
fp8_gemm_nt,
|
||||
get_col_major_tma_aligned_tensor,
|
||||
per_block_cast_to_fp8,
|
||||
)
|
||||
|
||||
@@ -48,8 +47,9 @@ def benchmark_shape(
|
||||
block_size = [128, 128]
|
||||
|
||||
# Pre-quantize A for all implementations
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
|
||||
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
|
||||
)
|
||||
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
|
||||
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
|
||||
@@ -13,6 +13,8 @@ endif()
|
||||
#
|
||||
# Define environment variables for special configurations
|
||||
#
|
||||
set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2})
|
||||
set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512})
|
||||
set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16})
|
||||
set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI})
|
||||
set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16})
|
||||
@@ -103,6 +105,16 @@ else()
|
||||
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
|
||||
find_isa(${CPUINFO} "S390" S390_FOUND)
|
||||
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
|
||||
|
||||
# Support cross-compilation by allowing override via environment variables
|
||||
if (ENABLE_AVX2)
|
||||
set(AVX2_FOUND ON)
|
||||
message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable")
|
||||
endif()
|
||||
if (ENABLE_AVX512)
|
||||
set(AVX512_FOUND ON)
|
||||
message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
@@ -379,6 +391,12 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/shm.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
if(USE_ONEDNN)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/dnnl_kernels.cpp"
|
||||
|
||||
@@ -19,7 +19,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
flashmla
|
||||
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
|
||||
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
|
||||
GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208
|
||||
GIT_PROGRESS TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
@@ -30,6 +30,24 @@ endif()
|
||||
FetchContent_MakeAvailable(flashmla)
|
||||
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
|
||||
|
||||
# Vendor FlashMLA interface into vLLM with torch-ops shim.
|
||||
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
|
||||
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
|
||||
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
|
||||
FLASHMLA_INTERFACE_CONTENT)
|
||||
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
|
||||
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
|
||||
FLASHMLA_INTERFACE_CONTENT
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
"${FLASHMLA_INTERFACE_CONTENT}")
|
||||
|
||||
# Install the generated flash_mla_interface.py to the wheel
|
||||
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
|
||||
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
|
||||
DESTINATION vllm/third_party/flashmla/
|
||||
COMPONENT _flashmla_C)
|
||||
|
||||
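The string(REPLACE ...) above rewrites the vendored interface so it resolves kernels through vLLM's own extension rather than the upstream flash_mla.cuda module; based on that replacement string, the import section of the generated file is effectively the following sketch (assumption, not the full file):

import torch

import vllm._flashmla_C                  # loads the extension and registers its torch custom ops
flash_mla_cuda = torch.ops._flashmla_C   # the rest of flash_mla_interface.py dispatches via this namespace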
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
|
||||
# Only build FlashMLA kernels if we are building for something compatible with
|
||||
# sm90a
|
||||
@@ -55,16 +73,42 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_SOURCES
|
||||
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/pybind.cpp
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
|
||||
|
||||
# Misc kernels for decoding
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
|
||||
|
||||
# sm90 dense decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
|
||||
|
||||
# sm90 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
|
||||
|
||||
# sm90 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
|
||||
|
||||
# sm100 dense prefill & backward
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
|
||||
|
||||
# sm100 sparse prefill
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
|
||||
|
||||
# sm100 sparse decode
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
|
||||
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
|
||||
)
|
||||
|
||||
set(FlashMLA_Extension_SOURCES
|
||||
@@ -76,6 +120,7 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/kerutils/include
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -83,7 +128,6 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
set(FlashMLA_Extension_INCLUDES
|
||||
${flashmla_SOURCE_DIR}/csrc
|
||||
${flashmla_SOURCE_DIR}/csrc/sm90
|
||||
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/include
|
||||
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
|
||||
@@ -110,9 +154,12 @@ if(FLASH_MLA_ARCHS)
|
||||
|
||||
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
|
||||
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
|
||||
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
|
||||
target_compile_options(_flashmla_C PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
|
||||
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
|
||||
|
||||
define_extension_target(
|
||||
_flashmla_extension_C
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <vector>
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
|
||||
@@ -25,6 +25,7 @@ typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
int64_t block_size_in_bytes,
|
||||
const torch::Tensor& block_mapping) {
|
||||
torch::Device src_device = src.device();
|
||||
torch::Device dst_device = dst.device();
|
||||
@@ -49,10 +50,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
char* src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char* dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
|
||||
// We use the stride instead of numel in case the cache is padded for memory
|
||||
// alignment reasons, we assume the blocks data (inclusive of any padding)
|
||||
// is contiguous in memory
|
||||
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(
|
||||
src_device.is_cuda() ? src_device : dst_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
@@ -205,7 +202,8 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
const int64_t block_stride, const int64_t page_stride,
|
||||
const int64_t head_stride, const int64_t key_stride,
|
||||
const int64_t value_stride, const int num_heads, const int head_size,
|
||||
const int block_size, const float* k_scale, const float* v_scale) {
|
||||
const int block_size, const float* k_scale, const float* v_scale,
|
||||
const int kv_scale_stride) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
@@ -229,21 +227,23 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
// this is true for the NHD layout where `head_stride == head_size`
|
||||
const bool is_contiguous_heads = (head_stride == head_size);
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
if (is_contiguous_heads) {
|
||||
// NHD layout
|
||||
|
||||
if (is_contiguous_heads && kv_scale_stride == 0) {
|
||||
// NHD layout and k/v_scales are [1] (i.e. single scale for all heads)
|
||||
// kv cache: [num_blocks, block_size, num_heads, head_size]
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
|
||||
blockDim.x, k_op);
|
||||
|
||||
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
|
||||
threadIdx.x, blockDim.x, v_op);
|
||||
|
||||
} else {
|
||||
// HND layout OR k/v_scales are [num_heads] (i.e. per-attn-head)
|
||||
// HND layout: heads are strided, but each head_size segment is contiguous
|
||||
// kv cache: [num_blocks, num_heads, block_size, head_size]
|
||||
const int lane = threadIdx.x & 31; // 0..31 within warp
|
||||
@@ -259,6 +259,16 @@ __global__ void reshape_and_cache_flash_kernel(
|
||||
cache_t* __restrict__ v_dst_h =
|
||||
value_dst + static_cast<int64_t>(head) * head_stride;
|
||||
|
||||
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: k_scale[head * kv_scale_stride];
|
||||
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto)
|
||||
? 0.f
|
||||
: v_scale[head * kv_scale_stride];
|
||||
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
|
||||
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
|
||||
|
||||
// within each head, let the 32 threads of the warp perform the vector
|
||||
// copy
|
||||
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
|
||||
@@ -608,7 +618,8 @@ void reshape_and_cache(
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
|
||||
head_stride, key_stride, value_stride, num_heads, head_size, \
|
||||
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()));
|
||||
reinterpret_cast<const float*>(v_scale.data_ptr()), \
|
||||
kv_scale_stride);
|
||||
|
||||
void reshape_and_cache_flash(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
@@ -617,8 +628,9 @@ void reshape_and_cache_flash(
|
||||
torch::Tensor&
|
||||
value_cache, // [num_blocks, block_size, num_heads, head_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& k_scale, // [1] or [num_heads]
|
||||
torch::Tensor& v_scale) { // [1] or [num_heads]
|
||||
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
|
||||
// slot_mapping.size(0) because of padding for CUDA graphs.
|
||||
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
|
||||
@@ -641,6 +653,12 @@ void reshape_and_cache_flash(
|
||||
int64_t head_stride = key_cache.stride(2);
|
||||
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
|
||||
|
||||
TORCH_CHECK(k_scale.sizes() == v_scale.sizes(),
|
||||
"k_scale and v_scale must have the same shape");
|
||||
TORCH_CHECK(k_scale.numel() == 1 || k_scale.numel() == num_heads,
|
||||
"k_scale and v_scale must be of shape [1] or [num_heads]");
|
||||
int kv_scale_stride = (k_scale.numel() > 1) ? 1 : 0;
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(num_heads * head_size, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||
|
||||
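The new TORCH_CHECKs make the scale contract explicit: k_scale and v_scale must share a shape of either [1] (one scale for the whole layer) or [num_heads] (per-attention-head scales), and kv_scale_stride selects the indexing mode inside the kernel. A hedged Python-side sketch of inputs that satisfy the check (values are illustrative only):

import torch

num_heads = 8
# Per-head FP8 KV-cache scales: numel() > 1, so kv_scale_stride becomes 1 and
# the kernel reads k_scale[head * 1] / v_scale[head * 1].
k_scale = torch.full((num_heads,), 0.5, dtype=torch.float32, device="cuda")
v_scale = torch.full((num_heads,), 0.5, dtype=torch.float32, device="cuda")
# A single shared scale is still accepted: shape [1] gives kv_scale_stride == 0,
# so every head reads the same k_scale[0] / v_scale[0].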
@@ -80,8 +80,10 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
reg.val[1] = vld1q_f16(reinterpret_cast<const __fp16*>(ptr) + 8);
|
||||
}
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16& vec);
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16& vec);
|
||||
void save(void* ptr) const {
|
||||
vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]);
|
||||
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
|
||||
@@ -190,6 +192,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg(*reinterpret_cast<const bfloat16x8x2_t*>(ptr)) {};
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
|
||||
|
||||
explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {};
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
@@ -474,6 +479,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
: reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8),
|
||||
vld1q_f32(ptr + 12)}) {}
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
|
||||
|
||||
explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec8& data) {
|
||||
@@ -756,6 +764,96 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
|
||||
};
|
||||
};
|
||||
|
||||
struct INT8Vec64 : public Vec<INT8Vec64> {
|
||||
constexpr static int VEC_ELEM_NUM = 64;
|
||||
union AliasReg {
|
||||
int8x16x4_t reg;
|
||||
int8_t values[VEC_ELEM_NUM];
|
||||
};
|
||||
int8x16x4_t reg;
|
||||
|
||||
explicit INT8Vec64(const int8_t* ptr) { reg = vld1q_s8_x4(ptr); }
|
||||
|
||||
// ASIMD does not support non-temporal loads
|
||||
explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
|
||||
|
||||
void save(int8_t* ptr) const { vst1q_s8_x4(ptr, reg); }
|
||||
|
||||
// masked store
|
||||
void save(int8_t* p, int elem_num) const {
|
||||
TORCH_CHECK(elem_num <= VEC_ELEM_NUM && elem_num > 0);
|
||||
|
||||
if (elem_num == VEC_ELEM_NUM) {
|
||||
vst1q_s8_x4(p, reg);
|
||||
return;
|
||||
}
|
||||
|
||||
const int full_quadwords = elem_num / 16;
|
||||
const int remaining_bytes = elem_num % 16;
|
||||
|
||||
for (int i = 0; i < full_quadwords; ++i) {
|
||||
vst1q_s8(p + 16 * i, reg.val[i]);
|
||||
}
|
||||
|
||||
if (remaining_bytes) {
|
||||
const int8x16_t v = reg.val[full_quadwords];
|
||||
int8_t* tail = p + 16 * full_quadwords;
|
||||
switch (remaining_bytes) {
|
||||
case 15:
|
||||
tail[14] = vgetq_lane_s8(v, 14);
|
||||
[[fallthrough]];
|
||||
case 14:
|
||||
tail[13] = vgetq_lane_s8(v, 13);
|
||||
[[fallthrough]];
|
||||
case 13:
|
||||
tail[12] = vgetq_lane_s8(v, 12);
|
||||
[[fallthrough]];
|
||||
case 12:
|
||||
tail[11] = vgetq_lane_s8(v, 11);
|
||||
[[fallthrough]];
|
||||
case 11:
|
||||
tail[10] = vgetq_lane_s8(v, 10);
|
||||
[[fallthrough]];
|
||||
case 10:
|
||||
tail[9] = vgetq_lane_s8(v, 9);
|
||||
[[fallthrough]];
|
||||
case 9:
|
||||
tail[8] = vgetq_lane_s8(v, 8);
|
||||
[[fallthrough]];
|
||||
case 8:
|
||||
tail[7] = vgetq_lane_s8(v, 7);
|
||||
[[fallthrough]];
|
||||
case 7:
|
||||
tail[6] = vgetq_lane_s8(v, 6);
|
||||
[[fallthrough]];
|
||||
case 6:
|
||||
tail[5] = vgetq_lane_s8(v, 5);
|
||||
[[fallthrough]];
|
||||
case 5:
|
||||
tail[4] = vgetq_lane_s8(v, 4);
|
||||
[[fallthrough]];
|
||||
case 4:
|
||||
tail[3] = vgetq_lane_s8(v, 3);
|
||||
[[fallthrough]];
|
||||
case 3:
|
||||
tail[2] = vgetq_lane_s8(v, 2);
|
||||
[[fallthrough]];
|
||||
case 2:
|
||||
tail[1] = vgetq_lane_s8(v, 1);
|
||||
[[fallthrough]];
|
||||
case 1:
|
||||
tail[0] = vgetq_lane_s8(v, 0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ASIMD does not support non-temporal stores
|
||||
void nt_save(int8_t* ptr) const { save(ptr); }
|
||||
}; // INT8Vec64
|
||||
|
||||
template <typename T>
|
||||
struct VecType {
|
||||
using vec_type = void;
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include <atomic>
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
#define MAX_SHM_RANK_NUM 8
|
||||
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
|
||||
@@ -34,8 +38,17 @@ struct KernelVecType<c10::Half> {
|
||||
};
|
||||
|
||||
struct ThreadSHMContext {
|
||||
#ifdef __aarch64__
|
||||
// memory model is weaker on AArch64, so we use atomic variables for
|
||||
// consumer (load-acquire) and producer (store-release) to make sure
|
||||
// that a stamp cannot be ready before the corresponding data is ready.
|
||||
std::atomic<char> _curr_thread_stamp[2];
|
||||
std::atomic<char> _ready_thread_stamp[2];
|
||||
static_assert(std::atomic<char>::is_always_lock_free);
|
||||
#else
|
||||
volatile char _curr_thread_stamp[2];
|
||||
volatile char _ready_thread_stamp[2];
|
||||
#endif // __aarch64__
|
||||
int local_stamp_buffer_idx;
|
||||
int remote_stamp_buffer_idx;
|
||||
int thread_id;
|
||||
@@ -62,10 +75,17 @@ struct ThreadSHMContext {
|
||||
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK((size_t)this % 64 == 0);
|
||||
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
||||
#ifdef __aarch64__
|
||||
_curr_thread_stamp[0].store(1, std::memory_order_relaxed);
|
||||
_curr_thread_stamp[1].store(1, std::memory_order_relaxed);
|
||||
_ready_thread_stamp[0].store(0, std::memory_order_relaxed);
|
||||
_ready_thread_stamp[1].store(0, std::memory_order_relaxed);
|
||||
#else
|
||||
_curr_thread_stamp[0] = 1;
|
||||
_curr_thread_stamp[1] = 1;
|
||||
_ready_thread_stamp[0] = 0;
|
||||
_ready_thread_stamp[1] = 0;
|
||||
#endif // __aarch64__
|
||||
_thread_buffer_mask[0] = 0;
|
||||
_thread_buffer_mask[1] = 0;
|
||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||
@@ -103,19 +123,43 @@ struct ThreadSHMContext {
|
||||
_thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF;
|
||||
}
|
||||
|
||||
char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; }
|
||||
char get_curr_stamp(int idx) const {
|
||||
#ifdef __aarch64__
|
||||
return _curr_thread_stamp[idx].load(std::memory_order_acquire);
|
||||
#else
|
||||
return _curr_thread_stamp[idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; }
|
||||
char get_ready_stamp(int idx) const {
|
||||
#ifdef __aarch64__
|
||||
return _ready_thread_stamp[idx].load(std::memory_order_acquire);
|
||||
#else
|
||||
return _ready_thread_stamp[idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
void next_stamp() {
|
||||
#ifdef __aarch64__
|
||||
_curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
|
||||
1, std::memory_order_release);
|
||||
#else
|
||||
_mm_mfence();
|
||||
_curr_thread_stamp[local_stamp_buffer_idx] += 1;
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
void commit_ready_stamp() {
|
||||
#ifdef __aarch64__
|
||||
_ready_thread_stamp[local_stamp_buffer_idx].store(
|
||||
_curr_thread_stamp[local_stamp_buffer_idx].load(
|
||||
std::memory_order_relaxed),
|
||||
std::memory_order_release);
|
||||
#else
|
||||
_mm_mfence();
|
||||
_ready_thread_stamp[local_stamp_buffer_idx] =
|
||||
_curr_thread_stamp[local_stamp_buffer_idx];
|
||||
#endif // __aarch64__
|
||||
}
|
||||
|
||||
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
||||
@@ -142,7 +186,11 @@ struct ThreadSHMContext {
|
||||
break;
|
||||
}
|
||||
++_spinning_count;
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__("yield");
|
||||
#else
|
||||
_mm_pause();
|
||||
#endif // __aarch64__
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
#endif
|
||||
|
||||
// SHM CCL
|
||||
#ifdef __AVX512F__
|
||||
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
|
||||
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
|
||||
&init_shm_manager);
|
||||
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
|
||||
@@ -250,7 +250,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
|
||||
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
|
||||
&shm_recv_tensor_list);
|
||||
#endif
|
||||
#endif // #if defined(__AVX512F__) || defined(__aarch64__)
|
||||
|
||||
// sgl-kernels
|
||||
#if defined(__AVX512BF16__) && defined(__AVX512F__) && defined(__AVX512VNNI__)
|
||||
|
||||
@@ -31,8 +31,6 @@ namespace moe {
|
||||
|
||||
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||
constexpr int32_t WARP_SIZE = 32;
|
||||
constexpr int32_t BLOCK_SIZE = 512;
|
||||
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
|
||||
|
||||
namespace warp_topk {
|
||||
|
||||
@@ -65,14 +63,6 @@ __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T, typename idxT>
|
||||
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
|
||||
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
|
||||
int64_t n = std::max<int>(num_of_warp / 2 * k, num_of_warp * WARP_SIZE);
|
||||
return max(cache_topk,
|
||||
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
|
||||
}
|
||||
|
||||
template <int size, bool ascending, bool reverse, typename T, typename idxT,
|
||||
bool is_stable>
|
||||
struct BitonicMerge {
|
||||
@@ -267,6 +257,15 @@ class WarpSort {
|
||||
}
|
||||
}
|
||||
|
||||
// Accessors for per-lane selected value/index.
|
||||
// NOTE: For the common case `capacity == WARP_SIZE`, `max_arr_len_ == 1`
|
||||
// and callers should use `i == 0`.
|
||||
__device__ __forceinline__ idxT get_idx(int i = 0) const {
|
||||
return idx_arr_[i];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ T get_val(int i = 0) const { return val_arr_[i]; }
|
||||
|
||||
protected:
|
||||
static constexpr int max_arr_len_ = capacity / WARP_SIZE;
|
||||
|
||||
@@ -285,6 +284,7 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
__device__ WarpSelect(idxT k, T dummy)
|
||||
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
|
||||
k_th_(dummy),
|
||||
k_th_idx_(0),
|
||||
k_th_lane_((k - 1) % WARP_SIZE) {
|
||||
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
|
||||
|
||||
@@ -346,9 +346,6 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
|
||||
merge_buf_(val, idx);
|
||||
}
|
||||
|
||||
// after done(), smem is used for merging results among warps
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -503,255 +500,186 @@ __device__ void topk_with_k2(T* output, T const* input, BiasT const* bias,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, ScoringFunc SF>
|
||||
__global__ void topk_with_k2_kernel(T* output, T* input, BiasT const* bias,
|
||||
int64_t const num_tokens,
|
||||
int64_t const num_cases,
|
||||
int64_t const n_group,
|
||||
int64_t const num_experts_per_group) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
|
||||
if (case_id < num_cases) {
|
||||
input += case_id * num_experts_per_group;
|
||||
// bias is per expert group, offset to current group
|
||||
int32_t group_id = case_id % n_group;
|
||||
BiasT const* group_bias = bias + group_id * num_experts_per_group;
|
||||
output += case_id;
|
||||
|
||||
cg::thread_block block = cg::this_thread_block();
|
||||
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;");
|
||||
#endif
|
||||
topk_with_k2<T, BiasT, SF>(output, input, group_bias, tile, lane_id,
|
||||
num_experts_per_group);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF,
|
||||
int NGroup = -1>
|
||||
__global__ void group_idx_and_topk_idx_kernel(
|
||||
T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices,
|
||||
BiasT const* bias, int64_t const num_tokens, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool renormalize,
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
|
||||
__global__ void grouped_topk_fused_kernel(
|
||||
T* scores, float* topk_values, IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const num_experts, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk, bool renormalize,
|
||||
double routed_scaling_factor) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
int32_t case_id =
|
||||
blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token
|
||||
scores += case_id * num_experts;
|
||||
group_scores += case_id * n_group;
|
||||
topk_values += case_id * topk;
|
||||
topk_indices += case_id * topk;
|
||||
int32_t const token_id = static_cast<int32_t>(blockIdx.x);
|
||||
if (token_id >= num_tokens) {
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr bool kUseStaticNGroup = (NGroup > 0);
|
||||
// use int32 to avoid implicit conversion
|
||||
int32_t const n_group_i32 =
|
||||
kUseStaticNGroup ? NGroup : static_cast<int32_t>(n_group);
|
||||
int32_t const warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t const lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
int32_t align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
int32_t const n_group_i32 = static_cast<int32_t>(n_group);
|
||||
int32_t const topk_group_i32 = static_cast<int32_t>(topk_group);
|
||||
int32_t const topk_i32 = static_cast<int32_t>(topk);
|
||||
int32_t const num_experts_i32 = static_cast<int32_t>(num_experts);
|
||||
|
||||
int32_t const num_warps = blockDim.x / WARP_SIZE;
|
||||
if (warp_id >= n_group_i32 || num_warps < n_group_i32) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t const num_experts_per_group = num_experts_i32 / n_group_i32;
|
||||
|
||||
T* scores_token = scores + static_cast<int64_t>(token_id) * num_experts;
|
||||
|
||||
cg::thread_block block = cg::this_thread_block();
|
||||
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
|
||||
|
||||
extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
|
||||
// store the target topk idx
|
||||
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
|
||||
T* s_topk_value =
|
||||
reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
|
||||
warp_id * topk;
|
||||
s_topk_idx += warp_id * topk;
|
||||
extern __shared__ char smem_buf[];
|
||||
// warpSelect internal staging buffer layout
|
||||
size_t const val_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
|
||||
size_t const val_bytes_aligned =
|
||||
warp_topk::round_up_to_multiple_of<256>(val_bytes);
|
||||
size_t const idx_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
|
||||
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
|
||||
|
||||
T value = neg_inf<T>();
|
||||
T topk_group_value = neg_inf<T>();
|
||||
int32_t num_equalto_topkth_group;
|
||||
// user-managed shared memory starts after warpSelect internal staging.
|
||||
uintptr_t ptr_u = reinterpret_cast<uintptr_t>(smem_buf + internal_bytes);
|
||||
ptr_u = (ptr_u + 15) & ~static_cast<uintptr_t>(15); // align to 16B
|
||||
T* s_group_scores = reinterpret_cast<T*>(ptr_u);
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
|
||||
// acqbulk because it's ptr arithmetic
|
||||
#endif
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min =
|
||||
WARP_SIZE - n_group_i32 + static_cast<int32_t>(topk_group);
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
// phase 1: per-group scan
|
||||
int32_t const group_offset = warp_id * num_experts_per_group;
|
||||
topk_with_k2<T, BiasT, SF>(s_group_scores + warp_id,
|
||||
scores_token + group_offset, bias + group_offset,
|
||||
tile, lane_id, num_experts_per_group);
|
||||
|
||||
int count_equal_to_top_value = WARP_SIZE - n_group_i32;
|
||||
int pre_count_equal_to_top_value = 0;
|
||||
// Use a loop to find the largest top_group
|
||||
while (count_equal_to_top_value < target_num_min) {
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = neg_inf<T>();
|
||||
}
|
||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||
count_equal_to_top_value =
|
||||
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
|
||||
}
|
||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// phase 2: warp0 selects groups + merges candidates to final topk
|
||||
if (warp_id != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
topk_values += static_cast<int64_t>(token_id) * topk;
|
||||
topk_indices += static_cast<int64_t>(token_id) * topk;
|
||||
|
||||
// select topk_group groups by group score
|
||||
warp_topk::WarpSelect</*capacity*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
queue((int32_t)topk, neg_inf<T>());
|
||||
group_sel(static_cast<int32_t>(topk_group_i32), neg_inf<T>());
|
||||
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
auto process_group = [&](int i_group) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
((group_scores[i_group] == topk_group_value) &&
|
||||
(count_equalto_topkth_group < num_equalto_topkth_group))) {
|
||||
int32_t offset = i_group * num_experts_per_group;
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||
i += WARP_SIZE) {
|
||||
T candidates = neg_inf<T>();
|
||||
if (i < num_experts_per_group) {
|
||||
// apply scoring function (if any) and add bias
|
||||
T input = scores[offset + i];
|
||||
if (is_finite(input)) {
|
||||
T score = apply_scoring<SF>(input);
|
||||
candidates = score + static_cast<T>(bias[offset + i]);
|
||||
}
|
||||
}
|
||||
queue.add(candidates, offset + i);
|
||||
}
|
||||
if (group_scores[i_group] == topk_group_value) {
|
||||
count_equalto_topkth_group++;
|
||||
// all lanes must participate in WarpSelect::add().
|
||||
T gscore = (lane_id < n_group_i32) ? s_group_scores[lane_id] : neg_inf<T>();
|
||||
group_sel.add(gscore, lane_id);
|
||||
group_sel.done();
|
||||
|
||||
  // proceed only if the k-th selected group score is not -inf
  bool proceed = false;
  if (topk_group_i32 > 0) {
    int const kth_lane = topk_group_i32 - 1;
    // broadcast the k-th selected group score to all lanes
    T kth_val = __shfl_sync(FULL_WARP_MASK, group_sel.get_val(0), kth_lane);
    proceed = (kth_val != neg_inf<T>());
  }

  if (!proceed) {
    for (int i = lane_id; i < topk_i32; i += WARP_SIZE) {
      topk_indices[i] = static_cast<IdxT>(i);
      topk_values[i] = 1.0f / static_cast<float>(topk_i32);
    }
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    asm volatile("griddepcontrol.launch_dependents;");
#endif
    return;
  }

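// Host-side reference for the fallback above (a sketch under the assumption
// that the caller only needs the first `topk` experts with uniform weights;
// not part of the diff).
#include <vector>
inline void uniform_topk_fallback(int topk, std::vector<int>& indices,
                                  std::vector<float>& values) {
  indices.resize(topk);
  values.resize(topk);
  for (int i = 0; i < topk; ++i) {
    indices[i] = i;                               // expert ids 0..topk-1
    values[i] = 1.0f / static_cast<float>(topk);  // uniform routing weight
  }
}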
// merge per-group topk candidates for selected groups, then select topk
|
||||
warp_topk::WarpSelect</*capacity*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
expert_sel(static_cast<int32_t>(topk_i32), neg_inf<T>());
|
||||
|
||||
// selected group ids reside in lanes [0, topk_group)
|
||||
int32_t sel_gid_lane = (lane_id < topk_group_i32) ? group_sel.get_idx(0) : 0;
|
||||
|
||||
// add candidates from selected groups to expert_sel
|
||||
for (int32_t g = 0; g < topk_group_i32; ++g) {
|
||||
int32_t gid = __shfl_sync(FULL_WARP_MASK, sel_gid_lane, g);
|
||||
int32_t const offset = gid * num_experts_per_group;
|
||||
int32_t const align_num_experts_per_group =
|
||||
warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) {
|
||||
// all lanes must call `add()` the same number of times.
|
||||
T cand = neg_inf<T>();
|
||||
int32_t idx = 0;
|
||||
if (i < num_experts_per_group) {
|
||||
idx = offset + i;
|
||||
T input = scores_token[idx];
|
||||
if (is_finite(input)) {
|
||||
T score = apply_scoring<SF>(input);
|
||||
cand = score + static_cast<T>(bias[idx]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if constexpr (kUseStaticNGroup) {
|
||||
#pragma unroll
|
||||
for (int i_group = 0; i_group < NGroup; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
} else {
|
||||
for (int i_group = 0; i_group < n_group_i32; ++i_group) {
|
||||
process_group(i_group);
|
||||
}
|
||||
}
|
||||
queue.done();
|
||||
// Get the topk_idx
|
||||
queue.dumpIdx(s_topk_idx);
|
||||
}
|
||||
|
||||
// Load the valid score value
|
||||
// Calculate the summation
|
||||
float topk_sum = 1e-20;
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
for (int i = lane_id;
|
||||
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
|
||||
i += WARP_SIZE) {
|
||||
T value = cuda_cast<T, float>(0.0f);
|
||||
if (i < topk) {
|
||||
// Load the score value (without bias) for normalization
|
||||
T input = scores[s_topk_idx[i]];
|
||||
value = apply_scoring<SF>(input);
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
if (renormalize) {
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
expert_sel.add(cand, idx);
|
||||
}
|
||||
}
|
||||
expert_sel.done();
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
if (if_proceed_next_topk) {
|
||||
float scale = routed_scaling_factor;
|
||||
if (renormalize) {
|
||||
scale /= topk_sum;
|
||||
}
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float base = cuda_cast<float, T>(s_topk_value[i]);
|
||||
float value = base * scale;
|
||||
topk_indices[i] = s_topk_idx[i];
|
||||
topk_values[i] = value;
|
||||
}
|
||||
} else {
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
topk_indices[i] = i;
|
||||
topk_values[i] = 1.0f / topk;
|
||||
}
|
||||
}
|
||||
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
|
||||
// default result.
|
||||
  // compute unbiased routing weights + optional renorm.
  float lane_unbiased = 0.0f;
  IdxT lane_idx = 0;
  if (lane_id < topk_i32) {
    lane_idx = static_cast<IdxT>(expert_sel.get_idx(0));
    T in = scores_token[static_cast<int32_t>(lane_idx)];
    lane_unbiased = cuda_cast<float, T>(apply_scoring<SF>(in));
  }

  float topk_sum = 1e-20f;
  if (renormalize) {
    topk_sum += cg::reduce(tile, lane_unbiased, cg::plus<float>());
  }

  float scale = static_cast<float>(routed_scaling_factor);
  if (renormalize) {
    scale /= topk_sum;
  }

  if (lane_id < topk_i32) {
    topk_indices[lane_id] = lane_idx;
    topk_values[lane_id] = lane_unbiased * scale;
  }

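// Host-side reference for the weight computation above (a sketch; assumes
// `unbiased[i]` already holds apply_scoring(score) of the i-th selected expert;
// not part of the diff).
#include <vector>
inline std::vector<float> scale_topk_weights(const std::vector<float>& unbiased,
                                             bool renormalize,
                                             float routed_scaling_factor) {
  float topk_sum = 1e-20f;
  if (renormalize) {
    for (float v : unbiased) topk_sum += v;
  }
  float scale = routed_scaling_factor / (renormalize ? topk_sum : 1.0f);
  std::vector<float> out(unbiased.size());
  for (size_t i = 0; i < unbiased.size(); ++i) out[i] = unbiased[i] * scale;
  return out;
}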
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT, ScoringFunc SF>
|
||||
inline void launch_group_idx_and_topk_kernel(
|
||||
cudaLaunchConfig_t const& config, T* scores, T* group_scores,
|
||||
float* topk_values, IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, int64_t const num_experts,
|
||||
int64_t const num_experts_per_group, bool const renormalize,
|
||||
double const routed_scaling_factor) {
|
||||
auto launch = [&](auto* kernel_instance2) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, bias, num_tokens, n_group,
|
||||
topk_group, topk, num_experts, num_experts_per_group,
|
||||
renormalize, routed_scaling_factor);
|
||||
};
|
||||
|
||||
switch (n_group) {
|
||||
case 4: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 4>);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 8>);
|
||||
break;
|
||||
}
|
||||
case 16: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 16>);
|
||||
break;
|
||||
}
|
||||
case 32: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF, 32>);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
launch(&group_idx_and_topk_idx_kernel<T, BiasT, IdxT, SF>);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename BiasT, typename IdxT>
|
||||
void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
IdxT* topk_indices, BiasT const* bias,
|
||||
int64_t const num_tokens, int64_t const num_experts,
|
||||
int64_t const n_group, int64_t const topk_group,
|
||||
int64_t const topk, bool const renormalize,
|
||||
double const routed_scaling_factor, int const scoring_func,
|
||||
bool enable_pdl = false, cudaStream_t const stream = 0) {
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
void invokeNoAuxTc(T* scores, float* topk_values, IdxT* topk_indices,
|
||||
BiasT const* bias, int64_t const num_tokens,
|
||||
int64_t const num_experts, int64_t const n_group,
|
||||
int64_t const topk_group, int64_t const topk,
|
||||
bool const renormalize, double const routed_scaling_factor,
|
||||
int const scoring_func, bool enable_pdl = false,
|
||||
cudaStream_t const stream = 0) {
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = 0;
|
||||
// One block per token; one warp per group.
|
||||
config.gridDim = static_cast<uint32_t>(num_tokens);
|
||||
config.blockDim = static_cast<uint32_t>(n_group) * WARP_SIZE;
|
||||
// Dynamic shared memory: WarpSelect staging + per-group topk buffers.
|
||||
int32_t const num_warps = static_cast<int32_t>(n_group);
|
||||
size_t const val_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(T);
|
||||
size_t const val_bytes_aligned =
|
||||
warp_topk::round_up_to_multiple_of<256>(val_bytes);
|
||||
size_t const idx_bytes =
|
||||
static_cast<size_t>(num_warps) * WARP_SIZE * sizeof(int32_t);
|
||||
size_t const internal_bytes = val_bytes_aligned + idx_bytes;
|
||||
size_t const extra_bytes = 16 + static_cast<size_t>(n_group) * sizeof(T);
|
||||
config.dynamicSmemBytes = internal_bytes + extra_bytes;
|
||||
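// Worked example (assumed values, not part of the diff): for n_group = 8 and
// T = __nv_bfloat16, val_bytes = 8 * 32 * 2 = 512 (already a multiple of 256),
// idx_bytes = 8 * 32 * 4 = 1024, so internal_bytes = 1536; extra_bytes =
// 16 + 8 * 2 = 32, giving dynamicSmemBytes = 1568.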
config.stream = stream;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
@@ -759,66 +687,35 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values,
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
auto const sf = static_cast<ScoringFunc>(scoring_func);
|
||||
int64_t const num_experts_per_group = num_experts / n_group;
|
||||
auto launch_topk_with_k2 = [&](auto* kernel_instance1) {
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias,
|
||||
num_tokens, num_cases, n_group, num_experts_per_group);
|
||||
};
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_NONE>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
auto* kernel_instance =
|
||||
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_NONE>;
|
||||
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
|
||||
topk_indices, bias, num_tokens, num_experts, n_group,
|
||||
topk_group, topk, renormalize, routed_scaling_factor);
|
||||
return;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T, BiasT, SCORING_SIGMOID>;
|
||||
launch_topk_with_k2(kernel_instance1);
|
||||
break;
|
||||
auto* kernel_instance =
|
||||
&grouped_topk_fused_kernel<T, BiasT, IdxT, SCORING_SIGMOID>;
|
||||
cudaLaunchKernelEx(&config, kernel_instance, scores, topk_values,
|
||||
topk_indices, bias, num_tokens, num_experts, n_group,
|
||||
topk_group, topk, renormalize, routed_scaling_factor);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
// should be guarded by higher level checks.
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
size_t dynamic_smem_in_bytes =
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
config.stream = stream;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
switch (sf) {
|
||||
case SCORING_NONE: {
|
||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_NONE>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
case SCORING_SIGMOID: {
|
||||
launch_group_idx_and_topk_kernel<T, BiasT, IdxT, SCORING_SIGMOID>(
|
||||
config, scores, group_scores, topk_values, topk_indices, bias,
|
||||
num_tokens, n_group, topk_group, topk, num_experts,
|
||||
num_experts_per_group, renormalize, routed_scaling_factor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc");
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT) \
|
||||
template void invokeNoAuxTc<T, BiasT, IdxT>( \
|
||||
T * scores, T * group_scores, float* topk_values, IdxT* topk_indices, \
|
||||
BiasT const* bias, int64_t const num_tokens, int64_t const num_experts, \
|
||||
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
||||
bool const renormalize, double const routed_scaling_factor, \
|
||||
#define INSTANTIATE_NOAUX_TC(T, BiasT, IdxT) \
|
||||
template void invokeNoAuxTc<T, BiasT, IdxT>( \
|
||||
T * scores, float* topk_values, IdxT* topk_indices, BiasT const* bias, \
|
||||
int64_t const num_tokens, int64_t const num_experts, \
|
||||
int64_t const n_group, int64_t const topk_group, int64_t const topk, \
|
||||
bool const renormalize, double const routed_scaling_factor, \
|
||||
int const scoring_func, bool enable_pdl, cudaStream_t const stream);
|
||||
|
||||
INSTANTIATE_NOAUX_TC(float, float, int32_t);
|
||||
@@ -843,17 +740,21 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
int64_t num_tokens = input_size[0];
|
||||
int64_t num_experts = input_size[1];
|
||||
TORCH_CHECK(input_size.size() == 2, "scores must be a 2D Tensor");
|
||||
TORCH_CHECK(n_group > 0, "n_group must be positive");
|
||||
TORCH_CHECK(topk > 0, "topk must be positive");
|
||||
TORCH_CHECK(topk_group > 0, "topk_group must be positive");
|
||||
TORCH_CHECK(topk_group <= n_group, "topk_group must be <= n_group");
|
||||
TORCH_CHECK(num_experts % n_group == 0,
|
||||
"num_experts should be divisible by n_group");
|
||||
TORCH_CHECK(n_group <= 32,
|
||||
"n_group should be smaller than or equal to 32 for now");
|
||||
TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");
|
||||
TORCH_CHECK(topk <= topk_group * (num_experts / n_group),
|
||||
"topk must be <= topk_group * (num_experts / n_group)");
|
||||
TORCH_CHECK(scoring_func == vllm::moe::SCORING_NONE ||
|
||||
scoring_func == vllm::moe::SCORING_SIGMOID,
|
||||
"scoring_func must be SCORING_NONE (0) or SCORING_SIGMOID (1)");
|
||||
|
||||
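// Worked example (assumed values, not from the diff): a DeepSeek-V3-style
// router with num_experts = 256, n_group = 8, topk_group = 4, topk = 8 passes
// every check above: 256 % 8 == 0, 8 <= 32, 8 <= 32, 4 <= 8, and
// 8 <= 4 * (256 / 8) = 128.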
torch::Tensor group_scores = torch::empty(
|
||||
{num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA));
|
||||
// Always output float32 for topk_values (eliminates Python-side conversion)
|
||||
torch::Tensor topk_values = torch::empty(
|
||||
{num_tokens, topk}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
|
||||
@@ -868,7 +769,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kFloat16: \
|
||||
vllm::moe::invokeNoAuxTc<T, half, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<half const*>(bias.data_ptr()), num_tokens, \
|
||||
@@ -879,7 +779,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kFloat32: \
|
||||
vllm::moe::invokeNoAuxTc<T, float, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<float const*>(bias.data_ptr()), num_tokens, \
|
||||
@@ -890,7 +789,6 @@ std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
|
||||
case torch::kBFloat16: \
|
||||
vllm::moe::invokeNoAuxTc<T, __nv_bfloat16, IdxT>( \
|
||||
reinterpret_cast<T*>(scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<T*>(group_scores.mutable_data_ptr()), \
|
||||
reinterpret_cast<float*>(topk_values.mutable_data_ptr()), \
|
||||
reinterpret_cast<IdxT*>(topk_indices.mutable_data_ptr()), \
|
||||
reinterpret_cast<__nv_bfloat16 const*>(bias.data_ptr()), \
|
||||
|
||||
@@ -58,7 +58,7 @@ TEMPLATE = (
    "( MARLIN_KERNEL_PARAMS );"
)

THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128), (128, 64, 128)]

THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]


@@ -3,8 +3,8 @@
|
||||
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
|
||||
#endif
|
||||
|
||||
#include "quantization/gptq_marlin/marlin.cuh"
|
||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/marlin/marlin.cuh"
|
||||
#include "quantization/marlin/marlin_dtypes.cuh"
|
||||
#include "core/scalar_type.hpp"
|
||||
|
||||
#define MARLIN_KERNEL_PARAMS \
|
||||
|
||||
@@ -23,10 +23,10 @@
|
||||
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
|
||||
#endif
|
||||
|
||||
#include "quantization/gptq_marlin/marlin.cuh"
|
||||
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/gptq_marlin/dequant.h"
|
||||
#include "quantization/gptq_marlin/marlin_mma.h"
|
||||
#include "quantization/marlin/marlin.cuh"
|
||||
#include "quantization/marlin/marlin_dtypes.cuh"
|
||||
#include "quantization/marlin/dequant.h"
|
||||
#include "quantization/marlin/marlin_mma.h"
|
||||
#include "core/scalar_type.hpp"
|
||||
|
||||
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
|
||||
|
||||
@@ -126,14 +126,16 @@ thread_config_t small_batch_thread_configs[] = {
|
||||
|
||||
// thread_k, thread_n, num_threads
|
||||
{128, 128, 256},
|
||||
{64, 128, 128}};
|
||||
{64, 128, 128},
|
||||
{128, 64, 128}};
|
||||
|
||||
thread_config_t large_batch_thread_configs[] = {
|
||||
// Ordered by priority
|
||||
|
||||
// thread_k, thread_n, num_threads
|
||||
{64, 256, 256},
|
||||
{64, 128, 128}};
|
||||
{64, 128, 128},
|
||||
{128, 64, 128}};
|
||||
|
||||
typedef struct {
|
||||
int blocks_per_sm;
|
||||
|
||||
@@ -4,7 +4,13 @@

void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
                  torch::Tensor& gating_output, bool renormalize);
                  torch::Tensor& gating_output, bool renormalize,
                  std::optional<torch::Tensor> bias);

void topk_sigmoid(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
                  torch::Tensor& gating_output, bool renormalize,
                  std::optional<torch::Tensor> bias);

void moe_sum(torch::Tensor& input, torch::Tensor& output);

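// Usage sketch (illustrative, not part of the diff): allocating outputs with
// the shapes/dtypes the declarations above expect before calling topk_sigmoid.
// Assumes the header above and the torch headers are visible; names are
// assumptions for the example only.
#include <optional>
#include <torch/torch.h>
inline void topk_sigmoid_example(torch::Tensor gating_output, int64_t topk,
                                 std::optional<torch::Tensor> bias) {
  int64_t num_tokens = gating_output.size(0);
  auto opts_f = gating_output.options().dtype(torch::kFloat32);
  auto opts_i = gating_output.options().dtype(torch::kInt32);
  torch::Tensor topk_weights = torch::empty({num_tokens, topk}, opts_f);
  torch::Tensor topk_indices = torch::empty({num_tokens, topk}, opts_i);
  torch::Tensor token_expert_indices = torch::empty({num_tokens, topk}, opts_i);
  topk_sigmoid(topk_weights, topk_indices, token_expert_indices, gating_output,
               /*renormalize=*/true, bias);
}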
@@ -42,7 +42,7 @@ void moe_permute(
|
||||
auto sort_workspace = torch::empty(
|
||||
{sorter_size},
|
||||
torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
|
||||
auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess
|
||||
torch::Tensor topk_ids_for_sort = topk_ids;
|
||||
auto permuted_experts_id = torch::empty_like(topk_ids);
|
||||
auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
|
||||
|
||||
@@ -62,12 +62,13 @@ void moe_permute(
|
||||
const int* expert_map_ptr = get_ptr<int>(expert_map.value());
|
||||
valid_num_ptr =
|
||||
get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
|
||||
preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
|
||||
topk_ids_for_sort = topk_ids.clone();
|
||||
preprocessTopkIdLauncher(get_ptr<int>(topk_ids_for_sort), n_token * topk,
|
||||
expert_map_ptr, n_expert, stream);
|
||||
}
|
||||
// sort the topk expert ids and scan them to get expert_first_token_offset
|
||||
sortAndScanExpert(
|
||||
get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<const int>(topk_ids_for_sort), get_ptr<int>(token_expert_indices),
|
||||
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
|
||||
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
|
||||
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
|
||||
|
||||
@@ -109,7 +109,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
      sorted_indices, total_indices, num_experts, expert_first_token_offset);
}

void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
                       int* permuted_experts, int* permuted_rows,
                       int64_t* expert_first_token_offset, int num_rows,
                       int num_experts, int num_experts_per_node, int k,

@@ -48,7 +48,7 @@ void computeExpertFirstTokenOffset(int const* sorted_indices,
                                   int64_t* expert_first_token_offset,
                                   cudaStream_t stream);

void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,
void sortAndScanExpert(const int* expert_for_source_row, const int* source_rows,
                       int* permuted_experts, int* permuted_rows,
                       int64_t* expert_first_token_offset, int num_rows,
                       int num_experts, int num_experts_per_node, int k,

@@ -62,6 +62,12 @@ __device__ __forceinline__ float toFloat(T value) {
  }
}

// Scoring function enums
enum ScoringFunc {
  SCORING_SOFTMAX = 0,  // apply softmax
  SCORING_SIGMOID = 1   // apply sigmoid
};

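// Sketch (assumption, not part of the diff): the enum is meant to be used as a
// compile-time template parameter, so the scoring step can be selected with
// `if constexpr` and folded into the gating kernel, e.g.:
template <ScoringFunc SF>
__device__ __forceinline__ float apply_scoring_example(float x) {
  if constexpr (SF == SCORING_SIGMOID) {
    return 1.0f / (1.0f + __expf(-x));  // element-wise sigmoid
  } else {
    return x;  // SCORING_SOFTMAX: softmax is a row-wise reduction handled elsewhere
  }
}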
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing the output
|
||||
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||
@@ -125,6 +131,27 @@ __launch_bounds__(TPB) __global__
    }
}

template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
    void moeSigmoid(const InputType* input, const bool* finished, float* output, const int num_cols)
{
    const int thread_row_offset = blockIdx.x * num_cols;

    // Don't touch finished rows.
    if ((finished != nullptr) && finished[blockIdx.x])
    {
        return;
    }

    for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
    {
        const int idx = thread_row_offset + ii;
        const float val = toFloat(input[idx]);
        const float sigmoid_val = 1.0f / (1.0f + __expf(-val));
        output[idx] = sigmoid_val;
    }
}

template <int TPB, typename IndType>
|
||||
__launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const float* inputs_after_softmax,
|
||||
@@ -136,7 +163,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const int k,
|
||||
const int start_expert,
|
||||
const int end_expert,
|
||||
const bool renormalize)
|
||||
const bool renormalize,
|
||||
const float* bias)
|
||||
{
|
||||
|
||||
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||
@@ -162,7 +190,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
{
|
||||
const int idx = thread_read_offset + expert;
|
||||
inp_kvp.key = expert;
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
|
||||
// Apply correction bias if provided
|
||||
if (bias != nullptr) {
|
||||
inp_kvp.value = inputs_after_softmax[idx] + bias[expert];
|
||||
} else {
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
}
|
||||
|
||||
for (int prior_k = 0; prior_k < k_idx; ++prior_k)
|
||||
{
|
||||
@@ -186,12 +220,13 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const bool should_process_row = row_is_active && node_uses_expert;
|
||||
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = result_kvp.value;
|
||||
// Return the unbiased scores for output weights
|
||||
output[idx] = inputs_after_softmax[thread_read_offset + expert];
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||
assert(indices[idx] >= 0);
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
if (renormalize) {
|
||||
selected_sum += result_kvp.value;
|
||||
selected_sum += inputs_after_softmax[thread_read_offset + expert];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -225,10 +260,12 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType,
|
||||
typename InputType = float, ScoringFunc SF>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
|
||||
void topkGating(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
const float* bias)
|
||||
{
|
||||
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
|
||||
std::is_same_v<InputType, __half>,
|
||||
@@ -353,61 +390,89 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
}
|
||||
}
|
||||
|
||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||
// convert to float afterwards for the exp + sum reduction.
|
||||
float thread_max = row_chunk[0];
|
||||
if constexpr (SF == SCORING_SOFTMAX) {
|
||||
// First, we perform a max reduce within the thread.
|
||||
float thread_max = row_chunk[0];
|
||||
#pragma unroll
|
||||
for (int ii = 1; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 1; ii < VPT; ++ii) {
|
||||
thread_max = max(thread_max, row_chunk[ii]);
|
||||
}
|
||||
}
|
||||
|
||||
// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
|
||||
}
|
||||
}
|
||||
|
||||
// From this point, thread max in all the threads have the max within the row.
|
||||
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
|
||||
float row_sum = 0;
|
||||
// From this point, thread max in all the threads have the max within the row.
|
||||
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
|
||||
float row_sum = 0;
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = expf(row_chunk[ii] - thread_max);
|
||||
row_sum += row_chunk[ii];
|
||||
}
|
||||
}
|
||||
|
||||
// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a butterfly pattern.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
|
||||
}
|
||||
}
|
||||
|
||||
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
|
||||
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
|
||||
// However, this kernel will likely not be a bottleneck and it seems better to more closely match torch and find the
|
||||
// argmax after computing the softmax.
|
||||
const float reciprocal_row_sum = 1.f / row_sum;
|
||||
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
|
||||
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
|
||||
// However, this kernel will likely not be a bottleneck and it seems better to more closely match torch and find the
|
||||
// argmax after computing the softmax.
|
||||
const float reciprocal_row_sum = 1.f / row_sum;
|
||||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
|
||||
}
|
||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii)
|
||||
{
|
||||
row_chunk[ii] = 1.0f / (1.0f + __expf(-row_chunk[ii]));
|
||||
}
|
||||
}
|
||||
|
||||
// Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
// If bias is not null, use biased value for selection
|
||||
float row_chunk_for_choice[VPT];
|
||||
// Apply correction bias
|
||||
if (bias != nullptr) {
|
||||
#pragma unroll
|
||||
for (int ldg = 0; ldg < LDG_PER_THREAD; ++ldg) {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
|
||||
const int expert = first_elt_read_by_thread + ldg * COLS_PER_GROUP_LDG + ii;
|
||||
float bias_val = expert < NUM_EXPERTS ? bias[expert] : 0.0f;
|
||||
row_chunk_for_choice[ldg * ELTS_PER_LDG + ii] = row_chunk[ldg * ELTS_PER_LDG + ii] + bias_val;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii) {
|
||||
row_chunk_for_choice[ii] = row_chunk[ii];
|
||||
}
|
||||
}
|
||||
|
||||
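// Sketch of the selection rule above (illustrative, not part of the diff):
// experts are ranked by score + bias, but the weight written to the output is
// the unbiased score of the chosen expert.
#include <utility>
#include <vector>
inline std::pair<int, float> pick_expert_with_bias(
    const std::vector<float>& score, const std::vector<float>& bias) {
  int best = 0;
  for (int e = 1; e < static_cast<int>(score.size()); ++e) {
    if (score[e] + bias[e] > score[best] + bias[best]) best = e;  // biased argmax
  }
  return {best, score[best]};  // report the unbiased weight
}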
// Now, row_chunk contains the softmax / sigmoid of the row chunk. Now, I want to find the topk elements in each row, along
|
||||
// with the max index.
|
||||
int start_col = first_elt_read_by_thread;
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
// First, each thread does the local argmax
|
||||
float max_val_for_choice = row_chunk_for_choice[0];
|
||||
float max_val = row_chunk[0];
|
||||
int expert = start_col;
|
||||
#pragma unroll
|
||||
@@ -416,12 +481,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
|
||||
{
|
||||
float val_for_choice = row_chunk_for_choice[ldg * ELTS_PER_LDG + ii];
|
||||
float val = row_chunk[ldg * ELTS_PER_LDG + ii];
|
||||
|
||||
// No check on the experts here since columns with the smallest index are processed first and only
|
||||
// updated if > (not >=)
|
||||
if (val > max_val)
|
||||
if (val_for_choice > max_val_for_choice)
|
||||
{
|
||||
max_val_for_choice = val_for_choice;
|
||||
max_val = val;
|
||||
expert = col + ii;
|
||||
}
|
||||
@@ -434,12 +501,14 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||
{
|
||||
float other_max_for_choice = VLLM_SHFL_XOR_SYNC_WIDTH(max_val_for_choice, mask, THREADS_PER_ROW);
|
||||
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
|
||||
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
|
||||
|
||||
// We want lower indices to "win" in every thread so we break ties this way
|
||||
if (other_max > max_val || (other_max == max_val && other_expert < expert))
|
||||
if (other_max_for_choice > max_val_for_choice || (other_max_for_choice == max_val_for_choice && other_expert < expert))
|
||||
{
|
||||
max_val_for_choice = other_max_for_choice;
|
||||
max_val = other_max;
|
||||
expert = other_expert;
|
||||
}
|
||||
@@ -474,7 +543,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
{
|
||||
const int offset_for_expert = expert % ELTS_PER_LDG;
|
||||
// Safe to set to any negative value since row_chunk values must be between 0 and 1.
|
||||
row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
|
||||
row_chunk_for_choice[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -508,10 +577,10 @@ struct TopkConstants
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType, ScoringFunc SF>
|
||||
void topkGatingLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
cudaStream_t stream)
|
||||
const float* bias, cudaStream_t stream)
|
||||
{
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
|
||||
@@ -521,43 +590,51 @@ void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finishe
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
|
||||
topkGating<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType, SF><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize, bias);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream);
|
||||
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
#define LAUNCH_TOPK(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES, \
|
||||
IndType, InputType, SF>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, renormalize, \
|
||||
bias, stream); \
|
||||
} else { \
|
||||
assert(false && \
|
||||
"Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxKernelLauncher(
|
||||
template <typename IndType, typename InputType, ScoringFunc SF>
|
||||
void topkGatingKernelLauncher(
|
||||
const InputType* gating_output,
|
||||
float* topk_weights,
|
||||
IndType* topk_indices,
|
||||
int* token_expert_indices,
|
||||
float* softmax_workspace,
|
||||
float* workspace,
|
||||
const int num_tokens,
|
||||
const int num_experts,
|
||||
const int topk,
|
||||
const bool renormalize,
|
||||
const float* bias,
|
||||
cudaStream_t stream) {
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
|
||||
@@ -569,64 +646,71 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
#endif
|
||||
switch (num_experts) {
|
||||
case 1:
|
||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 2:
|
||||
LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 4:
|
||||
LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 8:
|
||||
LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 16:
|
||||
LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 32:
|
||||
LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 64:
|
||||
LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 512:
|
||||
LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
LAUNCH_TOPK(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
// (CUDA only) support multiples of 64 when num_experts is not power of 2.
|
||||
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
|
||||
// alternatively we can test 4 bytes loading and enable it in future.
|
||||
#ifndef USE_ROCM
|
||||
case 192:
|
||||
LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 320:
|
||||
LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 384:
|
||||
LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 448:
|
||||
LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 576:
|
||||
LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
LAUNCH_TOPK(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
#endif
|
||||
default: {
|
||||
TORCH_CHECK(softmax_workspace != nullptr,
|
||||
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
TORCH_CHECK(workspace != nullptr,
|
||||
"workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
static constexpr int TPB = 256;
|
||||
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, softmax_workspace, num_experts);
|
||||
if constexpr (SF == SCORING_SOFTMAX) {
|
||||
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, workspace, num_experts);
|
||||
} else if constexpr (SF == SCORING_SIGMOID) {
|
||||
moeSigmoid<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, workspace, num_experts);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported scoring func");
|
||||
}
|
||||
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
|
||||
num_experts, topk, 0, num_experts, renormalize);
|
||||
workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
|
||||
num_experts, topk, 0, num_experts, renormalize, bias);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,40 +719,55 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
} // namespace vllm
|
||||
|
||||
|
||||
template<typename ComputeType>
|
||||
void dispatch_topk_softmax_launch(
|
||||
template<typename ComputeType, vllm::moe::ScoringFunc SF>
|
||||
void dispatch_topk_launch(
|
||||
torch::Tensor& gating_output,
|
||||
torch::Tensor& topk_weights,
|
||||
torch::Tensor& topk_indices,
|
||||
torch::Tensor& token_expert_indices,
|
||||
torch::Tensor& softmax_workspace,
|
||||
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
|
||||
{
|
||||
int num_tokens, int num_experts, int topk, bool renormalize,
|
||||
std::optional<torch::Tensor> bias,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
const float* bias_ptr = nullptr;
|
||||
if (bias.has_value()) {
|
||||
const torch::Tensor& bias_tensor = bias.value();
|
||||
TORCH_CHECK(bias_tensor.scalar_type() == at::ScalarType::Float, "bias tensor must be float32");
|
||||
TORCH_CHECK(bias_tensor.dim() == 1, "bias tensor must be 1D");
|
||||
TORCH_CHECK(bias_tensor.size(0) == num_experts, "bias size mismatch, expected: ", num_experts);
|
||||
TORCH_CHECK(bias_tensor.is_contiguous(), "bias tensor must be contiguous");
|
||||
bias_ptr = bias_tensor.data_ptr<float>();
|
||||
}
|
||||
|
||||
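// Sketch (assumption, not part of the diff): a bias argument that satisfies
// the checks above is a contiguous 1-D float32 tensor with one entry per
// expert; torch headers are assumed to be included as in this file.
inline torch::Tensor make_gating_bias(int64_t num_experts) {
  return torch::zeros({num_experts},
                      torch::dtype(torch::kFloat32).device(torch::kCUDA));
}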
if (topk_indices.scalar_type() == at::ScalarType::Int) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<int, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<uint32_t, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<uint32_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
} else {
|
||||
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
|
||||
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
|
||||
vllm::moe::topkGatingKernelLauncher<int64_t, ComputeType, SF>(
|
||||
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
|
||||
topk_weights.data_ptr<float>(),
|
||||
topk_indices.data_ptr<int64_t>(),
|
||||
token_expert_indices.data_ptr<int>(),
|
||||
softmax_workspace.data_ptr<float>(),
|
||||
num_tokens, num_experts, topk, renormalize, stream);
|
||||
num_tokens, num_experts, topk, renormalize,
|
||||
bias_ptr, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -677,7 +776,8 @@ void topk_softmax(
|
||||
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||
torch::Tensor& gating_output, // [num_tokens, num_experts]
|
||||
bool renormalize)
|
||||
bool renormalize,
|
||||
std::optional<torch::Tensor> bias)
|
||||
{
|
||||
const int num_experts = gating_output.size(-1);
|
||||
const auto num_tokens = gating_output.numel() / num_experts;
|
||||
@@ -693,14 +793,55 @@ void topk_softmax(
|
||||
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
|
||||
|
||||
if (gating_output.scalar_type() == at::ScalarType::Float) {
|
||||
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<float, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
|
||||
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<__half, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
|
||||
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
|
||||
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SOFTMAX>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
|
||||
}
|
||||
}
|
||||
|
||||
void topk_sigmoid(
|
||||
torch::Tensor& topk_weights, // [num_tokens, topk]
|
||||
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||
torch::Tensor& gating_output, // [num_tokens, num_experts]
|
||||
bool renormalize,
|
||||
std::optional<torch::Tensor> bias)
|
||||
{
|
||||
const int num_experts = gating_output.size(-1);
|
||||
const auto num_tokens = gating_output.numel() / num_experts;
|
||||
const int topk = topk_weights.size(-1);
|
||||
|
||||
const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||
const bool needs_workspace = !is_pow_2 || num_experts > 256;
|
||||
const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
|
||||
torch::Tensor workspace = torch::empty({workspace_size}, workspace_options);
|
||||
|
||||
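// Minimal sketch of the workspace rule above (not part of the diff): a float
// workspace of num_tokens * num_experts elements is only needed when
// num_experts is not a power of two or exceeds 256.
#include <cstdint>
inline int64_t gating_workspace_elems(int64_t num_tokens, int64_t num_experts) {
  bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
  bool needs_workspace = !is_pow_2 || num_experts > 256;
  return needs_workspace ? num_tokens * num_experts : 0;
}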
if (gating_output.scalar_type() == at::ScalarType::Float) {
|
||||
dispatch_topk_launch<float, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
|
||||
dispatch_topk_launch<__half, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
|
||||
dispatch_topk_launch<__nv_bfloat16, vllm::moe::SCORING_SIGMOID>(gating_output, topk_weights, topk_indices,
|
||||
token_expert_indices, workspace, num_tokens, num_experts, topk, renormalize,
|
||||
bias, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
|
||||
}
|
||||
|
||||
@@ -5,9 +5,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Apply topk softmax to the gating outputs.
  m.def(
      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
      "token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
      "bias) -> ()");
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

  // Apply topk sigmoid to the gating outputs.
  m.def(
      "topk_sigmoid(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
      "token_expert_indices, Tensor gating_output, bool renormalize, Tensor? "
      "bias) -> ()");
  m.impl("topk_sigmoid", torch::kCUDA, &topk_sigmoid);

  // Calculate the result of moe by summing up the partial results
  // from all selected experts.
  m.def("moe_sum(Tensor input, Tensor! output) -> ()");

@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);

void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
    std::optional<bool> force_swap_ab = std::nullopt);

void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
    const torch::Tensor& expert_first_token_offset,
    torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@@ -299,7 +293,8 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a);

void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_scale,
                      torch::Tensor const& input_scale);
                      torch::Tensor const& input_scale,
                      bool is_sf_swizzled_layout);

void scaled_fp4_experts_quant(
    torch::Tensor& output, torch::Tensor& output_scale,

@@ -27,17 +27,24 @@

#include "cuda_utils.h"
#include "launch_bounds_utils.h"

// Define before including nvfp4_utils.cuh so the header
// can use this macro during compilation.
#define NVFP4_ENABLE_ELTS16 1
#include "nvfp4_utils.cuh"

namespace vllm {

// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
                             float const* SFScale, uint32_t* out,
                             uint32_t* SFout) {
  using PackedVec = PackedVec<Type>;
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
    silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols,
                             int32_t num_padded_cols,
                             Type const* __restrict__ in,
                             float const* __restrict__ SFScale,
                             uint32_t* __restrict__ out,
                             uint32_t* __restrict__ SFout) {
  using PackedVec = vllm::PackedVec<Type>;
  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
@@ -49,34 +56,60 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
float const SFScaleVal = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Input tensor row/col loops.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
if (colIdx < num_padded_cols) {
|
||||
PackedVec in_vec;
|
||||
PackedVec in_vec2;
|
||||
int64_t inOffset =
|
||||
rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
int64_t inOffset2 = rowIdx * (numCols * 2 / CVT_FP4_ELTS_PER_THREAD) +
|
||||
numCols / CVT_FP4_ELTS_PER_THREAD + colIdx;
|
||||
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
PackedVec in_vec2 = reinterpret_cast<PackedVec const*>(in)[inOffset2];
|
||||
|
||||
// Get the output tensor offset.
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
int64_t outOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
auto& out_pos = out[outOffset];
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 8],
|
||||
valid);
|
||||
} else {
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec2, &reinterpret_cast<const uint32_t*>(in)[inOffset2 * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
// Compute silu and mul
|
||||
PackedVec out_silu_mul = compute_silu_mul(in_vec, in_vec2);
|
||||
PackedVec out_silu_mul = compute_silu_mul<Type>(in_vec, in_vec2);
|
||||
|
||||
auto sf_out =
|
||||
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
|
||||
sf_out);
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
out_silu_mul, SFScaleVal, sf_out);
|
||||
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,17 +136,23 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
  int const numBlocksPerSM =
      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

  int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);

  int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
  int grid_x = std::min(
      int(m), std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
  dim3 grid(grid_x, grid_y);

  VLLM_DISPATCH_HALF_TYPES(
      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
        vllm::silu_mul_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
            m, n, input_ptr, input_sf_ptr,
            m, n, sf_n_unpadded, input_ptr, input_sf_ptr,
            reinterpret_cast<uint32_t*>(output_ptr),
            reinterpret_cast<uint32_t*>(sf_out));
      });

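A worked example of the new two-dimensional launch shape, with assumed sizes chosen only to make the arithmetic concrete:

// Illustrative sizes: n = 8192, CVT_FP4_SF_VEC_SIZE = 16, block.x = 512.
//   sf_n_unpadded = 8192 / 16 = 512
//   grid_y        = div_round_up(512, 512) = 1
//   grid_x        = min(m, max(1, multiProcessorCount * numBlocksPerSM / 1))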
@@ -140,8 +140,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
|
||||
|
||||
out_pos =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
quant_input, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -246,8 +246,8 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
CVT_FP4_NUM_THREADS_PER_SF>(
|
||||
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
|
||||
|
||||
out_pos =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(quant_input, SFScaleVal, sf_out);
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
quant_input, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,8 @@
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                             torch::Tensor const& input,
                             torch::Tensor const& output_sf,
                             torch::Tensor const& input_sf);
                             torch::Tensor const& input_sf,
                             bool is_sf_swizzled_layout);
#endif

#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -51,10 +52,12 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
#endif

void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
                      torch::Tensor& output_sf, torch::Tensor const& input_sf,
                      bool is_sf_swizzled_layout) {
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
                                 is_sf_swizzled_layout);
#endif
  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
}

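Callers only need to thread the new flag through; a hedged one-line sketch (tensor names are placeholders, not from this diff):

// scaled_fp4_quant(out, in, out_sf, in_sf, /*is_sf_swizzled_layout=*/true);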
@@ -27,29 +27,23 @@
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
|
||||
// Define before including nvfp4_utils.cuh so the header
|
||||
// can use this macro during compilation.
|
||||
#define NVFP4_ENABLE_ELTS16 1
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename Int>
|
||||
__host__ __device__ inline Int round_up(Int x, Int y) {
|
||||
static_assert(std::is_integral_v<Int>,
|
||||
"round_up argument must be integral type");
|
||||
return ((x + y - 1) / y) * y;
|
||||
}
|
||||
|
||||
// Compute effective rows for grid configuration with swizzled SF layouts.
|
||||
inline int computeEffectiveRows(int m) {
|
||||
constexpr int ROW_TILE = 128;
|
||||
return round_up(m, ROW_TILE);
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, int32_t num_padded_cols,
|
||||
Type const* __restrict__ in,
|
||||
float const* __restrict__ SFScale,
|
||||
uint32_t* __restrict__ out, uint32_t* __restrict__ SFout) {
|
||||
using PackedVec = vllm::PackedVec<Type>;
|
||||
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
|
||||
@@ -59,33 +53,31 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
int32_t const numKTiles = (numCols + 63) / 64;
|
||||
|
||||
int sf_m = round_up<int>(numRows, 128);
|
||||
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
|
||||
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
|
||||
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
|
||||
float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
|
||||
for (int colIdx = threadIdx.x;
|
||||
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
|
||||
colIdx += blockDim.x) {
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
if (colIdx < num_padded_cols) {
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
if (rowIdx >= numRows || elem_idx >= numCols) {
|
||||
memset(&in_vec, 0, sizeof(PackedVec));
|
||||
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
} else {
|
||||
// Valid Region: Load actual data
|
||||
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
@@ -94,13 +86,85 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
rowIdx, colIdx, numKTiles, SFout);
|
||||
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (rowIdx < numRows && elem_idx < numCols) {
|
||||
// Same as inOffset because 8 elements are packed into one uint32_t.
|
||||
out[inOffset] = out_val;
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4_sf_major(int32_t numRows, int32_t numCols,
|
||||
int32_t sf_n_unpadded, Type const* __restrict__ in,
|
||||
float const* __restrict__ SFScale,
|
||||
uint32_t* __restrict__ out,
|
||||
uint32_t* __restrict__ SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
|
||||
"Vec size is not matched.");
|
||||
|
||||
int32_t const colIdx = blockDim.x * blockIdx.y + threadIdx.x;
|
||||
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
// Get the global scaling factor, which will be applied to the SF.
|
||||
// Note SFScale is the same as next GEMM's alpha, which is
|
||||
// (448.f / (Alpha_A / 6.f)).
|
||||
float const global_scale = (SFScale == nullptr) ? 1.0f : SFScale[0];
|
||||
|
||||
// Iterate over all rows and cols including padded ones -
|
||||
// ensures we visit every single scale factor address to initialize it.
|
||||
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
|
||||
if (colIdx < sf_n_unpadded) {
|
||||
PackedVec in_vec;
|
||||
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
|
||||
|
||||
// If we are outside valid rows OR outside valid columns -> Use Zeros
|
||||
bool valid = (rowIdx < numRows) && (elem_idx < numCols);
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
ld256_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 8],
|
||||
valid);
|
||||
} else {
|
||||
ld128_or_zero_cg_u32<Type>(
|
||||
in_vec, &reinterpret_cast<const uint32_t*>(in)[inOffset * 4],
|
||||
valid);
|
||||
}
|
||||
|
||||
auto sf_out =
|
||||
sf_out_rowmajor_u8<uint32_t>(rowIdx, colIdx, sf_n_unpadded, SFout);
|
||||
|
||||
auto out_val =
|
||||
cvt_warp_fp16_to_fp4<Type, CVT_FP4_NUM_THREADS_PER_SF, UE8M0_SF>(
|
||||
in_vec, global_scale, sf_out);
|
||||
|
||||
// We do NOT write output for padding because the 'out' tensor is not
|
||||
// padded.
|
||||
if (valid) {
|
||||
if constexpr (CVT_FP4_PACK16) {
|
||||
int64_t outOffset = rowIdx * (numCols / 8) + colIdx * 2;
|
||||
uint64_t packed64 =
|
||||
(uint64_t(out_val.hi) << 32) | uint64_t(out_val.lo);
|
||||
reinterpret_cast<uint64_t*>(out)[outOffset >> 1] = packed64;
|
||||
} else {
|
||||
out[inOffset] = out_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -111,7 +175,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& output_sf,
|
||||
torch::Tensor const& input_sf) {
|
||||
torch::Tensor const& input_sf,
|
||||
bool is_sf_swizzled_layout) {
|
||||
int32_t m = input.size(0);
|
||||
int32_t n = input.size(1);
|
||||
|
||||
@@ -129,19 +194,48 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
|
||||
int sf_n_unpadded = int(n / CVT_FP4_SF_VEC_SIZE);
|
||||
|
||||
// Grid, Block size. Each thread converts 8 values.
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
int effectiveRows = vllm::computeEffectiveRows(m);
|
||||
dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
if (is_sf_swizzled_layout) {
|
||||
int sf_n_int = int(vllm::round_up(sf_n_unpadded, 4) / 4);
|
||||
int32_t num_padded_cols =
|
||||
sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
|
||||
|
||||
int grid_y = vllm::div_round_up(num_padded_cols, static_cast<int>(block.x));
|
||||
int grid_x =
|
||||
std::min(vllm::computeEffectiveRows(m),
|
||||
std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
|
||||
dim3 grid(grid_x, grid_y);
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
|
||||
m, n, num_padded_cols, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
} else {
|
||||
int grid_y = vllm::div_round_up(sf_n_unpadded, static_cast<int>(block.x));
|
||||
int grid_x = std::min(
|
||||
m, std::max(1, (multiProcessorCount * numBlocksPerSM) / grid_y));
|
||||
dim3 grid(grid_x, grid_y);
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
// NOTE: We don't support e8m0 scales at this moment.
|
||||
vllm::cvt_fp16_to_fp4_sf_major<cuda_type, false>
|
||||
<<<grid, block, 0, stream>>>(m, n, sf_n_unpadded, input_ptr,
|
||||
input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,9 +19,17 @@
#include <cuda_runtime.h>
#include <cuda_fp8.h>

#define ELTS_PER_THREAD 8

#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
#define ELTS_PER_THREAD 16
constexpr int CVT_FP4_ELTS_PER_THREAD = 16;
constexpr bool CVT_FP4_PACK16 = true;
#else
#define ELTS_PER_THREAD 8
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
constexpr bool CVT_FP4_PACK16 = false;
#endif

constexpr int CVT_FP4_SF_VEC_SIZE = 16;

namespace vllm {
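A short note on the invariant these constants encode; the static_assert is an illustrative addition, not part of the diff:

// Each scale factor covers CVT_FP4_SF_VEC_SIZE = 16 input elements, so
// CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD threads cooperate per SF:
// one thread in the 16-element (PACK16) configuration, two in the 8-element one.
static_assert(CVT_FP4_SF_VEC_SIZE % CVT_FP4_ELTS_PER_THREAD == 0,
              "SF vector size must be a multiple of the per-thread element count");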
@@ -68,19 +76,46 @@ struct TypeConverter<__nv_bfloat16> {
  using Type = __nv_bfloat162;
};

#if (defined(NVFP4_ENABLE_ELTS16) && (CUDART_VERSION >= 12090) && \
     defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100)
// Define a 32 bytes packed data type.
template <class Type>
struct alignas(32) PackedVec {
  typename TypeConverter<Type>::Type elts[8];
};
#else
// Define a 16 bytes packed data type.
template <class Type>
struct PackedVec {
struct alignas(16) PackedVec {
  typename TypeConverter<Type>::Type elts[4];
};
#endif

template <>
struct PackedVec<__nv_fp8_e4m3> {
  __nv_fp8x2_e4m3 elts[8];
};

template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) {
  static_assert(std::is_integral_v<Int>,
                "round_up argument must be integral type");
  return ((x + y - 1) / y) * y;
}

template <typename Int>
__host__ __device__ __forceinline__ Int div_round_up(Int x, Int y) {
  return (x + y - 1) / y;
}

// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
  constexpr int ROW_TILE = 128;
  return round_up(m, ROW_TILE);
}

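A few illustrative values for the helpers above, with assumed inputs chosen only to make the rounding behaviour concrete:

// round_up(57, 16)          == 64
// div_round_up(57, 16)      == 4
// computeEffectiveRows(300) == round_up(300, 128) == 384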
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
inline __device__ uint32_t fp32_vec8_to_e2m1(float (&array)[8]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
@@ -101,7 +136,7 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
__device__ __forceinline__ uint32_t fp32_vec8_to_e2m1(float2 (&array)[4]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
@@ -114,20 +149,115 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
"}\n"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
}
|
||||
|
||||
struct u32x2 {
|
||||
uint32_t lo, hi;
|
||||
};
|
||||
|
||||
using fp4_packed_t = std::conditional_t<CVT_FP4_PACK16, u32x2, uint32_t>;
|
||||
|
||||
__device__ __forceinline__ u32x2 fp32_vec16_to_e2m1(float2 (&array)[8]) {
|
||||
u32x2 out;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 b0;\n"
|
||||
".reg .b8 b1;\n"
|
||||
".reg .b8 b2;\n"
|
||||
".reg .b8 b3;\n"
|
||||
".reg .b8 b4;\n"
|
||||
".reg .b8 b5;\n"
|
||||
".reg .b8 b6;\n"
|
||||
".reg .b8 b7;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b0, %3, %2;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b1, %5, %4;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b2, %7, %6;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b3, %9, %8;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b4, %11, %10;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b5, %13, %12;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b6, %15, %14;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 b7, %17, %16;\n"
|
||||
"mov.b32 %0, {b0, b1, b2, b3};\n"
|
||||
"mov.b32 %1, {b4, b5, b6, b7};\n"
|
||||
"}\n"
|
||||
: "=r"(out.lo), "=r"(out.hi)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y),
|
||||
"f"(array[4].x), "f"(array[4].y), "f"(array[5].x), "f"(array[5].y),
|
||||
"f"(array[6].x), "f"(array[6].y), "f"(array[7].x), "f"(array[7].y));
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint32_t pack_fp4(float2 (&v)[4]) {
|
||||
return fp32_vec8_to_e2m1(v);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ u32x2 pack_fp4(float2 (&v)[8]) {
|
||||
return fp32_vec16_to_e2m1(v);
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
__device__ __forceinline__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class Type>
|
||||
__device__ __forceinline__ void ld128_or_zero_cg_u32(PackedVec<Type>& out,
|
||||
const void* ptr,
|
||||
bool pred) {
|
||||
uint32_t r0, r1, r2, r3;
|
||||
|
||||
asm volatile(
|
||||
"{\n"
|
||||
" .reg .pred pr;\n"
|
||||
" setp.ne.u32 pr, %4, 0;\n"
|
||||
" mov.u32 %0, 0;\n"
|
||||
" mov.u32 %1, 0;\n"
|
||||
" mov.u32 %2, 0;\n"
|
||||
" mov.u32 %3, 0;\n"
|
||||
" @pr ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%5];\n"
|
||||
"}\n"
|
||||
: "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
|
||||
: "r"((int)pred), "l"(ptr));
|
||||
|
||||
*reinterpret_cast<uint4*>(&out) = uint4{r0, r1, r2, r3};
|
||||
}
|
||||
|
||||
template <class Type>
|
||||
__device__ __forceinline__ void ld256_or_zero_cg_u32(PackedVec<Type>& out,
|
||||
const void* ptr,
|
||||
bool pred) {
|
||||
uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
|
||||
|
||||
asm volatile(
|
||||
"{\n"
|
||||
" .reg .pred pr;\n"
|
||||
" setp.ne.u32 pr, %8, 0;\n"
|
||||
" mov.u32 %0, 0;\n"
|
||||
" mov.u32 %1, 0;\n"
|
||||
" mov.u32 %2, 0;\n"
|
||||
" mov.u32 %3, 0;\n"
|
||||
" mov.u32 %4, 0;\n"
|
||||
" mov.u32 %5, 0;\n"
|
||||
" mov.u32 %6, 0;\n"
|
||||
" mov.u32 %7, 0;\n"
|
||||
" @pr ld.global.cg.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%9];\n"
|
||||
"}\n"
|
||||
: "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
|
||||
"=r"(r7)
|
||||
: "r"((int)pred), "l"(ptr));
|
||||
|
||||
reinterpret_cast<uint4*>(&out)[0] = uint4{r0, r1, r2, r3};
|
||||
reinterpret_cast<uint4*>(&out)[1] = uint4{r4, r5, r6, r7};
|
||||
}
|
||||
|
||||
// Compute SF output offset for swizzled tensor core layout.
|
||||
// SF layout: [numMTiles, numKTiles, 32, 4, 4]
|
||||
// Caller must precompute: numKTiles = (numCols + 63) / 64
|
||||
@@ -166,21 +296,41 @@ __device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
|
||||
template <class SFType>
|
||||
__device__ __forceinline__ uint8_t* sf_out_rowmajor_u8(int row, int pack,
|
||||
int packs_per_row_sf,
|
||||
SFType* SFout) {
|
||||
constexpr int PACK = CVT_FP4_ELTS_PER_THREAD;
|
||||
constexpr int THREADS_PER_SF =
|
||||
      CVT_FP4_SF_VEC_SIZE / PACK;  // 1 if PACK=16, 2 if PACK=8
|
||||
|
||||
if (threadIdx.x % THREADS_PER_SF != 0) return nullptr;
|
||||
|
||||
int sf_col =
|
||||
pack / THREADS_PER_SF; // PACK=16 => sf_col=pack; PACK=8 => sf_col=pack/2
|
||||
int64_t off = (int64_t)row * packs_per_row_sf + sf_col;
|
||||
|
||||
return (uint8_t*)SFout + off;
|
||||
}
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
template <class Type, int CVT_FP4_NUM_THREADS_PER_SF, bool UE8M0_SF = false>
|
||||
__device__ __forceinline__ fp4_packed_t
|
||||
cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(vec.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(vec.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
|
||||
if constexpr (CVT_FP4_NUM_THREADS_PER_SF == 2) {
|
||||
localMax = __hmax2(__shfl_xor_sync(0xffffffffu, localMax, 1), localMax);
|
||||
}
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
@@ -205,18 +355,17 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
|
||||
// Write the SF to global memory (STG.8).
|
||||
if (SFout) *SFout = fp8SFVal;
|
||||
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
SFValue != 0.0f ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
@@ -233,10 +382,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
return pack_fp4(fp2Vals);
|
||||
}
|
||||
|
||||
// silu in float32
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <iostream>
|
||||
#include "../gptq_marlin/marlin_dtypes.cuh"
|
||||
#include "../marlin/marlin_dtypes.cuh"
|
||||
using marlin::MarlinScalarType2;
|
||||
|
||||
namespace allspark {
|
||||
|
||||
@@ -70,15 +70,6 @@ QUANT_CONFIGS = [
|
||||
"thread_m_blocks": THREAD_M_BLOCKS,
|
||||
"group_blocks": [-1, 2, 4, 8],
|
||||
},
|
||||
# HQQ
|
||||
{
|
||||
"a_type": ["kFloat16"],
|
||||
"b_type": "kU4",
|
||||
"thread_configs": THREAD_CONFIGS,
|
||||
"thread_m_blocks": THREAD_M_BLOCKS,
|
||||
"group_blocks": [4],
|
||||
"is_zp_float": True,
|
||||
},
|
||||
# GPTQ-INT4
|
||||
{
|
||||
"b_type": "kU4B8",
|
||||
@@ -46,7 +46,7 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(
|
||||
torch::Tensor marlin_gemm(
|
||||
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
|
||||
torch::Tensor& b_q_weight,
|
||||
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
|
||||
@@ -528,7 +528,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
|
||||
|
||||
} // namespace marlin
|
||||
|
||||
torch::Tensor gptq_marlin_gemm(
|
||||
torch::Tensor marlin_gemm(
|
||||
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
|
||||
torch::Tensor& b_q_weight,
|
||||
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
|
||||
@@ -856,5 +856,5 @@ torch::Tensor gptq_marlin_gemm(
|
||||
#endif
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("gptq_marlin_gemm", &gptq_marlin_gemm);
|
||||
m.impl("marlin_gemm", &marlin_gemm);
|
||||
}
|
||||
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
|
||||
auto options_int32 =
|
||||
torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
|
||||
torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
|
||||
|
||||
// Swap-AB should be disabled for FP4 path
|
||||
bool may_swap_ab =
|
||||
force_swap_ab.value_or((!blockscale_offsets.has_value()) &&
|
||||
(topk_ids.numel() <= SWAP_AB_THRESHOLD));
|
||||
|
||||
launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2,
|
||||
atomic_buffer, num_experts, n, k, stream,
|
||||
may_swap_ab);
|
||||
}
|
||||
|
||||
template <bool SWAP_AB>
|
||||
__global__ void compute_problem_sizes_from_expert_offsets(
|
||||
const int64_t* __restrict__ expert_first_token_offset,
|
||||
|
||||
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
|
||||
const int64_t num_experts, const int64_t n, const int64_t k,
|
||||
const std::optional<torch::Tensor>& blockscale_offsets);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt);
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets,
|
||||
std::optional<bool> force_swap_ab = std::nullopt) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
|
||||
problem_sizes2, num_experts, n, k,
|
||||
blockscale_offsets, force_swap_ab);
|
||||
return;
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
|
||||
"kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
|
||||
const torch::Tensor& expert_first_token_offset,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
|
||||
@@ -9,6 +9,10 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount);
|
||||
|
||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias,
|
||||
const int64_t CuCount);
|
||||
|
||||
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
|
||||
const at::Tensor& scale_a, const at::Tensor& scale_b,
|
||||
|
||||
@@ -13,6 +13,13 @@
#include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh"

// TODO(rasmith): The kernels in this file are susceptible to integer overflow
// issues, do not take strides, and are unable to handle PyTorch tensors that
// return is_contiguous() as False (the tensors may actually be contiguous
// in memory).
//
// However, it may be possible to fix these kernels to handle both issues.

#if defined(__HIPCC__) && \
    (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
#define __HIP__GFX9__
@@ -287,6 +294,11 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
    V0 += (s.x + s.y); \
  }

// To avoid LLVM silently upcasting to double
__device__ inline unsigned int min__(uint32_t a, uint32_t b) {
  return min(a, b);
}

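One line of added context for the helper above (my paraphrase of the existing comment, not taken from the diff):

// Example: min__(K * N, max_lds_len) keeps the comparison in unsigned 32-bit
// arithmetic, where a generic min() overload could otherwise promote to double.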
#if defined(__HIP__GFX9__) // TODO: Add NAVI support
|
||||
// This version targets cases where A[] fits LDS capacity
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
@@ -334,11 +346,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -633,11 +645,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
// - Then the WG will move to another 8 K elements
|
||||
// TODO: Logic below will only work when K is multiple of 8
|
||||
//----------------------------------------------------
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -954,11 +966,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
//----------------------------------------------------
|
||||
#define PCML
|
||||
#ifndef PCML
|
||||
for (uint32_t k = 0; k < min(K * N, max_lds_len);
|
||||
for (uint32_t k = 0; k < min__(K * N, max_lds_len);
|
||||
k += THRDS * WvPrGrp * A_CHUNK) {
|
||||
uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK);
|
||||
|
||||
if (k_in >= min(K * N, max_lds_len)) break;
|
||||
if (k_in >= min__(K * N, max_lds_len)) break;
|
||||
|
||||
*((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in]));
|
||||
}
|
||||
@@ -975,7 +987,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
? kFit
|
||||
: (kFit - kFit % TUC); // round up to multiple of TUC
|
||||
// if (kFit == 0) kFit = TUC;
|
||||
kFit = min(kFit, K);
|
||||
kFit = min__(kFit, K);
|
||||
|
||||
float sum[N][YTILE];
|
||||
scalar8 sum4[N][YTILE];
|
||||
@@ -1251,6 +1263,7 @@ int mindiv(int N, int div1, int div2) {
|
||||
}
|
||||
for (int i = 12; i >= 0; i--)
|
||||
if (rnds[0] == rnds[i]) return (div2 - i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
@@ -1352,6 +1365,536 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
return out_c;
|
||||
}
|
||||
|
||||
#if defined(__gfx950__) // TODO: Add NAVI support
|
||||
// This version targets big A[] cases, where it is much larger than LDS
|
||||
// capacity
|
||||
#define WVSPLITKRC_1KPASS
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB>
|
||||
|
||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
||||
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
|
||||
const int By, const scalar_t* __restrict__ B,
|
||||
const scalar_t* __restrict__ A,
|
||||
const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
|
||||
const int CuCount) {
|
||||
// Use upper half of glbl buffer for atomic reduce counting
|
||||
int* cntr = (int*)(&glbl[M * N]);
|
||||
|
||||
constexpr int NTILE = 16;
|
||||
constexpr int WVLDS_ = (NTILE * THRDS * A_CHUNK);
|
||||
constexpr int APAD = 1;
|
||||
constexpr int ASTRD = 64;
|
||||
constexpr int BPAD = 1;
|
||||
constexpr int BSTRD = 64;
|
||||
constexpr int WVLDS = ((WVLDS_ + (WVLDS_ / BSTRD) * 4 * BPAD));
|
||||
|
||||
constexpr int max_lds_len = LDS_SIZE / 2;
|
||||
|
||||
using scalar16 =
|
||||
__attribute__((__vector_size__((A_CHUNK * 2) * sizeof(float)))) float;
|
||||
using scalar8 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float;
|
||||
using half4 =
|
||||
__attribute__((__vector_size__((A_CHUNK / 2) * sizeof(__bf16)))) __bf16;
|
||||
union bigType {
|
||||
scalar_t h[A_CHUNK];
|
||||
float f[A_CHUNK / 2];
|
||||
unsigned int i[A_CHUNK / 2];
|
||||
float2 f2[A_CHUNK / 4];
|
||||
unsigned long l[A_CHUNK / 4];
|
||||
double d[A_CHUNK / 4];
|
||||
half4 h4[A_CHUNK / 4];
|
||||
scalar8 h8;
|
||||
};
|
||||
using big4 = __attribute__((__vector_size__(4 * sizeof(bigType)))) __bf16;
|
||||
|
||||
__shared__ scalar_t stg[WvPrGrp * WVLDS / GrpsShrB];
|
||||
unsigned int* myStg = (unsigned int*)(&stg[WVLDS * (threadIdx.y / GrpsShrB)]);
|
||||
__shared__ scalar_t s[max_lds_len - WvPrGrp * WVLDS / GrpsShrB];
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
constexpr int TUC_ = (THRDS * UNRL * A_CHUNK);
|
||||
// find biggest k size that fits padded into LDS
|
||||
constexpr uint32_t kFit__ = (max_lds_len - WvPrGrp * WVLDS / GrpsShrB) / N;
|
||||
constexpr uint32_t kFit_ = (kFit__ * ASTRD) / (APAD + ASTRD);
|
||||
uint32_t kFit = kFit_ - (kFit_ % TUC_);
|
||||
uint32_t kfitsPerRdc = (K + kFit - 1) / kFit;
|
||||
|
||||
// find best k split to fill the CUs
|
||||
if (((K + kfitsPerRdc * kFit - 1) / (kfitsPerRdc * kFit)) * numCuWithFullK <=
|
||||
CuCount)
|
||||
while (true) {
|
||||
while (kFit > TUC_) {
|
||||
uint32_t kFit_ = kFit - TUC_;
|
||||
if (((K + (kfitsPerRdc * kFit_ - 1)) / (kfitsPerRdc * kFit_)) *
|
||||
numCuWithFullK >
|
||||
CuCount)
|
||||
break;
|
||||
kFit = kFit_;
|
||||
}
|
||||
if (((K + ((kfitsPerRdc - 1) * kFit - 1)) / ((kfitsPerRdc - 1) * kFit)) *
|
||||
numCuWithFullK <=
|
||||
CuCount)
|
||||
kfitsPerRdc--;
|
||||
else
|
||||
break;
|
||||
}
|
||||
#else
|
||||
int constexpr kFit = 512;
|
||||
int constexpr kfitsPerRdc = 1;
|
||||
#endif
|
||||
|
||||
bool doRdc = (kfitsPerRdc * kFit < K);
|
||||
uint32_t numCuWithFullK =
|
||||
((M + (WvPrGrp * YTILE / GrpsShrB) - 1) / (WvPrGrp * YTILE / GrpsShrB));
|
||||
uint32_t Mmod = numCuWithFullK * (WvPrGrp * YTILE / GrpsShrB);
|
||||
|
||||
// given above k-split, find this wave's position
|
||||
uint32_t kFitPdd = kFit + (kFit / ASTRD) * APAD;
|
||||
uint32_t m0 = (blockIdx.x * WvPrGrp / GrpsShrB) * YTILE;
|
||||
uint32_t m1 = ((threadIdx.y % WvPrGrp) / GrpsShrB) * YTILE;
|
||||
uint32_t m = (m0 + m1) % Mmod;
|
||||
const uint32_t k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
|
||||
uint32_t k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
|
||||
const uint32_t k_rnd = (K + kFit * kfitsPerRdc - 1) / (kFit * kfitsPerRdc);
|
||||
|
||||
scalar8 sum4[N / NTILE / GrpsShrB][1];
|
||||
bigType bigB_[YTILE / GrpsShrB][UNRL];
|
||||
const uint32_t bLoader = (threadIdx.y % GrpsShrB);
|
||||
uint32_t kBase = 0;
|
||||
if (k_str >= K) return;
|
||||
if (m >= Mmod) return;
|
||||
|
||||
bool noreloada = false;
|
||||
constexpr bool FAST_UNSAFE_RDC_INIT = false;
|
||||
|
||||
#ifdef WVSPLITKRC_1KPASS
|
||||
// Early glbl init, B[] loading, if 1KPASS
|
||||
if constexpr (FAST_UNSAFE_RDC_INIT) {
|
||||
if (m + (threadIdx.x % 16) < M)
|
||||
if (doRdc)
|
||||
if (k_str == 0) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
__hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
__hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load first B[] chunk
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k_str + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
|
||||
#pragma unroll
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
|
||||
bigB_[y][k2].h8 = (loadnt(
|
||||
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
|
||||
}
|
||||
{
|
||||
#else
|
||||
while (m < Mmod) {
|
||||
#endif
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
if constexpr (FAST_UNSAFE_RDC_INIT) {
|
||||
if (m + (threadIdx.x % 16) < M)
|
||||
if (doRdc)
|
||||
if (k_str == 0) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
__hip_atomic_store(&cntr[adr_], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
__hip_atomic_store(&glbl[adr], 0, __ATOMIC_RELAXED,
|
||||
__HIP_MEMORY_SCOPE_AGENT);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
for (uint32_t k1 = k_str; k1 < k_end; k1 += THRDS * A_CHUNK * UNRL) {
|
||||
#else
|
||||
const uint32_t k1 = k_str;
|
||||
{
|
||||
#endif
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
const bool reloada = (!noreloada) &&
|
||||
((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
|
||||
(k1 < k_end);
|
||||
// load next chunk of A[] to LDS
|
||||
if (reloada) {
|
||||
if (k1 != k_str) kBase += kFit;
|
||||
__syncthreads();
|
||||
#else
|
||||
const bool reloada = (!noreloada) &&
|
||||
((k1 == k_str) || (k1 == k_str + kBase + kFit)) &&
|
||||
(k1 < k_end);
|
||||
if (reloada) {
|
||||
#endif
|
||||
constexpr int sprdN = 4;
|
||||
const uint32_t thrd = ((threadIdx.y / sprdN) * THRDS + threadIdx.x);
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
#pragma unroll
|
||||
for (int k = 0; k < kFit; k += THRDS * (WvPrGrp / sprdN) * A_CHUNK) {
|
||||
#else
|
||||
const unsigned int k = 0;
|
||||
{
|
||||
#endif
|
||||
unsigned int kOff = k + (thrd * A_CHUNK);
|
||||
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
||||
const unsigned int k_in = kOffcp + ((threadIdx.y % sprdN)) * K;
|
||||
const unsigned int k_ot = kOff + ((threadIdx.y % sprdN)) * kFitPdd;
|
||||
for (unsigned int n = 0; n < N / 2; n += sprdN) {
|
||||
__builtin_amdgcn_global_load_lds((int*)(&A[k_in + n * K]),
|
||||
(int*)(&s[(k_ot + n * kFitPdd)]),
|
||||
16, 0, 0);
|
||||
if (((threadIdx.y % sprdN)) + n + N / 2 >= actlN) continue;
|
||||
__builtin_amdgcn_global_load_lds(
|
||||
(int*)(&A[k_in + (n + N / 2) * K]),
|
||||
(int*)(&s[(k_ot + (n + N / 2) * kFitPdd)]), 16, 0, 0);
|
||||
}
|
||||
|
||||
// Stage loaded B[] to LDS for MFMA swizzling...
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const bool oob_k = (k_ >= K);
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++) {
|
||||
uint32_t idx = threadIdx.x * 4 +
|
||||
(y * GrpsShrB + bLoader) * ((THRDS + BPAD) * 4);
|
||||
// zero out if oob
|
||||
*((scalar8*)&myStg[idx]) =
|
||||
(oob_k || (y * GrpsShrB + bLoader + m >= M))
|
||||
? 0
|
||||
: bigB_[y][k2].h8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
// Fire load of next B[] chunk...
|
||||
if ((k1 + THRDS * A_CHUNK * UNRL < k_end) &&
|
||||
(k1 + THRDS * A_CHUNK * UNRL < K))
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + THRDS * A_CHUNK * UNRL + k2 * THRDS * A_CHUNK;
|
||||
uint32_t k_ = k + threadIdx.x * A_CHUNK;
|
||||
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
|
||||
#pragma unroll
|
||||
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
|
||||
bigB_[y][k2].h8 = (loadnt(
|
||||
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
|
||||
}
|
||||
#endif
|
||||
|
||||
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
||||
// back
|
||||
__syncthreads();
|
||||
|
||||
// read back B[] swizzled for MFMA...
|
||||
bigType bigB[YTILE][UNRL];
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
for (uint32_t y = 0; y < YTILE; y++) {
|
||||
unsigned int idx = (threadIdx.x % YTILE) * ((THRDS + BPAD) * 4) +
|
||||
(threadIdx.x / YTILE) * 4 + y * 16;
|
||||
bigB[y][k2].h8 = *((scalar8*)&myStg[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
    // Read back A[] swizzled for MFMA...
|
||||
bigType bigA[N / GrpsShrB][UNRL];
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
uint32_t k = k1 + k2 * THRDS * A_CHUNK - kBase - k_str;
|
||||
#pragma unroll
|
||||
for (uint32_t nt = 0; nt < N / GrpsShrB; nt += NTILE)
|
||||
#pragma unroll
|
||||
for (uint32_t n = 0; n < NTILE; n++) {
|
||||
uint32_t idxa = (nt + (threadIdx.x % NTILE) +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB)) *
|
||||
kFitPdd +
|
||||
A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
|
||||
bigA[nt + n][k2] = *((const bigType*)(&(s[idxa])));
|
||||
}
|
||||
}
|
||||
|
||||
// Do the MFMAs
|
||||
#pragma unroll
|
||||
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
|
||||
#pragma unroll
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
|
||||
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
|
||||
0, 0);
|
||||
} else { // bf16
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
|
||||
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
|
||||
0, 0);
|
||||
}
|
||||
#pragma unroll
|
||||
for (uint32_t j = 1; j < YTILE; j++) {
|
||||
if constexpr (std::is_same_v<scalar_t, half>) {
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
|
||||
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
} else { // bf16
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
|
||||
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
|
||||
0, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!doRdc) {
|
||||
if (m + (threadIdx.x % 16) < M) {
|
||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {0};
|
||||
if (BIAS)
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
if (BIAS) sum4[nt][0][j] += __bfloat162float(biases[nt][j]);
|
||||
C[adr] = __float2bfloat16(sum4[nt][0][j]);
|
||||
} else {
|
||||
if (BIAS) sum4[nt][0][j] += __half2float(biases[nt][j]);
|
||||
C[adr] = __float2half(sum4[nt][0][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (m + (threadIdx.x % 16) < M) {
|
||||
int my_cntr;
|
||||
if (!BIAS) {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
atomicAdd(&glbl[adr], sum4[nt][0][j]);
|
||||
}
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
||||
if (my_cntr + 1 == k_rnd) {
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
vals[nt][j] = glbl[adr];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
if (nindx >= actlN) break;
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
||||
} else {
|
||||
C[adr] = __float2half(vals[nt][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int mindx = m + (threadIdx.x % 16);
|
||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
||||
// Atomic add the output, read biases
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
atomicAdd(&glbl[adr], sum4[nt][0][j]);
|
||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
|
||||
}
|
||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr_ = mindx + M * nindx_ / 4;
|
||||
// Update the complete counter
|
||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
||||
// If we're the last k-shard, read back the value and convert...
|
||||
if (my_cntr + 1 == k_rnd) {
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
int adr = mindx + M * nindx;
|
||||
vals[nt][j] = glbl[adr];
|
||||
}
|
||||
}
|
||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||
for (uint32_t j = 0; j < 4; j++) {
|
||||
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||
if (nindx >= actlN) break;
|
||||
int adr = mindx + M * nindx;
|
||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||
vals[nt][j] += __bfloat162float(biases[nt][j]);
|
||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
||||
} else {
|
||||
vals[nt][j] += __half2float(biases[nt][j]);
|
||||
C[adr] = __float2half(vals[nt][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef WVSPLITKRC_1KPASS
|
||||
m0 += CuCount * WvPrGrp * YTILE / GrpsShrB;
|
||||
m = (m0 + m1) % Mmod;
|
||||
k_str = (m0 / Mmod) * kFit * kfitsPerRdc;
|
||||
k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
|
||||
if (k_str >= K) break;
|
||||
kBase = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||
int UNRL, int N, int GrpsShrB>
|
||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
|
||||
const int Bx, const int By, const scalar_t* B,
|
||||
const scalar_t* __restrict__ A,
|
||||
const scalar_t* __restrict__ BIAS, float* glbl,
|
||||
// int* cntr,
|
||||
scalar_t* C, const int CuCount){UNREACHABLE_CODE}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
                         const std::optional<at::Tensor>& in_bias,
                         const int64_t CuCount) {
  auto M_in = in_a.size(0);
  auto N_in = in_b.size(0);
  auto K_in = in_a.size(1);
  auto Bx_in =
      (in_bias.has_value() && in_bias->numel() > 0)
          ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
          : 1;
  auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
                in_bias->sizes().size() == 2)
                   ? in_bias->size(0)
                   : 1;

  TORCH_CHECK(in_a.dtype() == in_b.dtype());
  TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
  TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
              in_a.dtype() == torch::kBFloat16);

  auto out_c = torch::empty(
      {N_in, M_in},
      torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));

  auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
  auto axl_glbl = torch::empty(
      {N_p2 + N_p2 / 4, M_in + M_in / 4},
      torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
  axl_glbl.zero_();  // disable for FAST_UNSAFE_RDC_INIT

  dim3 grid(CuCount);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // const int max_lds_len = get_lds_size() / 2;

#define WVSPLITKrc(_WvPrGrp, _YTILE, _UNRL, _N, _GrpsShrB)                     \
  {                                                                            \
    dim3 block(64, _WvPrGrp);                                                  \
    wvSplitKrc_<fptype, 64, _YTILE, _WvPrGrp, 8, _UNRL, _N, _GrpsShrB>         \
        <<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
                                     biasf4, glbl, c, CuCount);                \
  }

  AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
    using fptype = typename scalar<scalar_t>::type;
    fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
    const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
    const fptype* biasf4 =
        (in_bias.has_value() && in_bias->numel() > 0)
            ? reinterpret_cast<const fptype*>(in_bias->data_ptr())
            : nullptr;
    fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
    auto glbl = axl_glbl.data_ptr<float>();
    switch (N_p2) {
      case 16:
        WVSPLITKrc(4, 16, 1, 16, 1) break;
      case 32:
        WVSPLITKrc(4, 16, 1, 32, 2) break;
      case 64:
        WVSPLITKrc(4, 16, 1, 64, 2) break;
      case 128:
        WVSPLITKrc(4, 16, 1, 128, 4) break;
      default:
        throw std::runtime_error(
            "Unsupported N value: " + std::to_string(M_in) + "," +
            std::to_string(K_in) + "," + std::to_string(N_in));
    }
  });
  return out_c;
}

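The launcher above rounds N up to the next power of two with 1U << (32 - __builtin_clz(N_in - 1)) so the switch can dispatch to a fixed set of template instantiations (16/32/64/128), and it pads the fp32 scratch tensor by an extra quarter in each dimension. A small host-side illustration of the rounding follows (valid for n >= 2, mirroring the launcher's implicit assumption about N_in; the helper name is ours, not part of the diff).

// Host-side illustration of the next-power-of-two rounding used for N_p2.
// Undefined for n <= 1, since __builtin_clz(0) is undefined.
#include <cstdint>
#include <cstdio>

static uint32_t next_pow2(uint32_t n) {
  // Count the leading zeros of (n - 1); shifting 1 by the remaining bit width
  // rounds n up to the next power of two (16 -> 16, 17 -> 32).
  return 1u << (32 - __builtin_clz(n - 1));
}

int main() {
  const uint32_t tests[] = {2u, 16u, 17u, 100u, 128u};
  for (uint32_t n : tests) printf("%u -> %u\n", n, next_pow2(n));
  return 0;
}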
#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
@@ -1381,7 +1924,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
  }
  __syncthreads();
@@ -1570,7 +2113,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
       k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
  }
  __syncthreads();

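Both hunks above stage the A operand into LDS with a block-strided copy: each thread moves A_CHUNK contiguous elements per iteration and strides by THRDS * WvPrGrp * A_CHUNK until the smaller of K * N and the LDS capacity is covered. A generic sketch of that staging pattern follows; the function name and template parameters are illustrative, not taken from the diff.

// Generic cooperative staging of a global-memory operand into shared memory.
// Names and parameters are illustrative; callers must __syncthreads() before
// reading the staged data, as the kernels above do.
#include <hip/hip_runtime.h>

template <typename T, int THREADS, int WAVES_PER_GROUP, int CHUNK>
__device__ void stage_to_lds(const T* __restrict__ src, T* __restrict__ lds,
                             int total_elems, int lds_capacity) {
  int tid = threadIdx.y * THREADS + threadIdx.x;
  int limit = total_elems < lds_capacity ? total_elems : lds_capacity;
  for (int k = tid * CHUNK; k < limit; k += THREADS * WAVES_PER_GROUP * CHUNK) {
#pragma unroll
    for (int i = 0; i < CHUNK; i++) {
      lds[k + i] = src[k + i];  // the real kernels do this as one wide vector copy
    }
  }
}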
@@ -26,6 +26,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
      "Tensor");
  rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);

  // Custom gemm op for skinny matrix-matrix multiplication
  rocm_ops.def(
      "wvSplitKrc(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
      "Tensor");
  rocm_ops.impl("wvSplitKrc", torch::kCUDA, &wvSplitKrc);

  // wvSplitK for fp8
  rocm_ops.def(
      "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "

@@ -303,9 +303,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
  ops.impl("permute_cols", torch::kCUDA, &permute_cols);

  // gptq_marlin Optimized Quantized GEMM for GPTQ.
  // Marlin Optimized Quantized GEMM (supports GPTQ, AWQ, FP8, NVFP4, MXFP4).
  ops.def(
      "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
      "marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
      "Tensor? b_bias_or_none,Tensor b_scales, "
      "Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
      "Tensor? "
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "()");
  ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);

  // A function that computes problem sizes for each expert's multiplication
  // used by the two mms called from fused MoE operation. It takes topk_ids as
  // an input, and computes problem_sizes1 and problem_sizes2 only.
  ops.def(
      "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
      " Tensor! problem_sizes1, "
      " Tensor! problem_sizes2, "
      " int num_experts, int n, int k, "
      " Tensor? blockscale_offsets, "
      " bool? force_swap_ab) -> ()");
  ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
           &get_cutlass_moe_mm_problem_sizes);

  // compute per-expert problem sizes from expert_first_token_offset
  // produced by vLLM's moe_permute kernel
  ops.def(
@@ -559,7 +546,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Compute NVFP4 block quantized tensor.
  ops.def(
      "scaled_fp4_quant(Tensor! output, Tensor input,"
      " Tensor! output_scale, Tensor input_scale) -> ()");
      " Tensor! output_scale, Tensor input_scale, bool "
      "is_sf_swizzled_layout) -> ()");
  ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);

  // Compute NVFP4 experts quantization.
@@ -705,7 +693,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
      "swap_blocks(Tensor src, Tensor! dst,"
      " int block_size_in_bytes, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);

  // Reshape the key and value tensors and cache them.

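In the registrations above, each def declares an operator schema (with Tensor? marking optional inputs and Tensor! marking tensors that are mutated in place) and the matching impl binds a CUDA-dispatch-key implementation to that schema. A stripped-down, stand-alone example of the same def/impl pattern is shown below; the library name and the op are hypothetical and are not part of this diff.

// Hypothetical op used only to illustrate the def/impl registration pattern
// (the real bindings above go through the TORCH_LIBRARY_EXPAND macro).
#include <ATen/ATen.h>
#include <torch/library.h>

at::Tensor my_scale(const at::Tensor& x, double factor) {
  return x * factor;  // schema "float" maps to double on the C++ side
}

TORCH_LIBRARY(my_ops, m) {
  // Declare the schema: argument names, types, and the return type.
  m.def("my_scale(Tensor x, float factor) -> Tensor");
}

TORCH_LIBRARY_IMPL(my_ops, CUDA, m) {
  // Bind the implementation for the CUDA dispatch key, analogous to
  // ops.impl("...", torch::kCUDA, &fn) above.
  m.impl("my_scale", &my_scale);
}

Once such an extension is loaded, the op is reachable from Python as torch.ops.my_ops.my_scale(x, 2.0).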
@@ -5,6 +5,23 @@
|
||||
# docs/contributing/dockerfile/dockerfile.md and
|
||||
# docs/assets/contributing/dockerfile-stages-dependency.png
|
||||
|
||||
# =============================================================================
|
||||
# VERSION MANAGEMENT
|
||||
# =============================================================================
|
||||
# ARG defaults in this Dockerfile are the source of truth for pinned versions.
|
||||
# docker/versions.json is auto-generated for use with docker buildx bake.
|
||||
#
|
||||
# When updating versions:
|
||||
# 1. Edit the ARG defaults below
|
||||
# 2. Run: python tools/generate_versions_json.py
|
||||
#
|
||||
# To query versions programmatically:
|
||||
# jq -r '.variable.CUDA_VERSION.default' docker/versions.json
|
||||
#
|
||||
# To build with bake:
|
||||
# docker buildx bake -f docker/docker-bake.hcl -f docker/versions.json
|
||||
# =============================================================================
|
||||
|
||||
ARG CUDA_VERSION=12.9.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
@@ -117,8 +134,8 @@ ENV UV_LINK_MODE=copy
|
||||
# Verify GCC version
|
||||
RUN gcc --version
|
||||
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# Ensure CUDA compatibility library is loaded
|
||||
RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
@@ -131,16 +148,41 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build and runtime dependencies, including PyTorch
|
||||
# Check whether to install torch nightly instead of release for this build
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY pyproject.toml pyproject.toml
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 torch torchaudio torchvision --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& echo "Installing other requirements..." \
|
||||
&& /opt/venv/bin/python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
# Track PyTorch lib versions used during build and match in downstream instances.
|
||||
# We do this for both nightly and release so we can strip dependencies/*.txt as needed.
|
||||
# Otherwise library dependencies can upgrade/downgrade torch incorrectly.
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip freeze | grep -i "^torch=\|^torchvision=\|^torchaudio=" > torch_lib_versions.txt \
|
||||
&& TORCH_LIB_VERSIONS=$(cat torch_lib_versions.txt | xargs) \
|
||||
&& echo "Installed torch libs: ${TORCH_LIB_VERSIONS}"
|
||||
|
||||
# CUDA arch list used by torch
|
||||
# Explicitly set the list to avoid issues with torch 2.2
|
||||
# See https://github.com/pytorch/pytorch/pull/123243
|
||||
# From versions.json: .torch.cuda_arch_list
|
||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BUILD BASE IMAGE ####################
|
||||
@@ -153,8 +195,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@@ -164,8 +211,18 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing build requirements without torch..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
&& echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing build requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
@@ -197,6 +254,13 @@ ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
# Use dummy version for csrc-build wheel (only .so files are extracted, version doesn't matter)
|
||||
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
|
||||
|
||||
# Use existing torch for nightly builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
python3 use_existing_torch.py --prefix; \
|
||||
fi
|
||||
|
||||
# Build the vLLM wheel
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
@@ -240,6 +304,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
|
||||
#################### EXTENSIONS BUILD IMAGE ####################
|
||||
@@ -256,7 +321,8 @@ ENV UV_LINK_MODE=copy
|
||||
WORKDIR /workspace
|
||||
|
||||
# Build DeepGEMM wheel
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
# Default moved here from tools/install_deepgemm.sh for centralized version management
|
||||
ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/deepgemm/dist && \
|
||||
@@ -271,8 +337,9 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
# Build pplx-kernels and DeepEP wheels
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
ARG PPLX_COMMIT_HASH
|
||||
ARG DEEPEP_COMMIT_HASH
|
||||
# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
|
||||
ARG PPLX_COMMIT_HASH=12cecfd
|
||||
ARG DEEPEP_COMMIT_HASH=73b6ea4
|
||||
ARG NVSHMEM_VER
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
mkdir -p /tmp/ep_kernels_workspace/dist && \
|
||||
@@ -294,8 +361,13 @@ ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
|
||||
# install build dependencies
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install build dependencies
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
@@ -305,14 +377,23 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing build requirements without torch..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
&& echo "Installing torch nightly..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | grep -i "^torch=" | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing build requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Copy pre-built csrc wheel directly
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
|
||||
COPY . .
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@@ -325,6 +406,13 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
# Skip adding +precompiled suffix to version (preserves git-derived version)
|
||||
ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
|
||||
|
||||
# Use existing torch for nightly builds
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
python3 use_existing_torch.py --prefix; \
|
||||
fi
|
||||
|
||||
# Build the vLLM wheel
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "${vllm_target_device}" = "cuda" ]; then \
|
||||
@@ -347,7 +435,8 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
else \
|
||||
echo "Skipping wheel size check."; \
|
||||
fi
|
||||
#################### EXTENSION Build IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
FROM base AS dev
|
||||
@@ -365,12 +454,34 @@ ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install development dependencies
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.in requirements/test.in
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing dev requirements plus torch nightly..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& cat torch_lib_versions.txt >> requirements/test.in \
|
||||
&& uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& uv pip install --python /opt/venv/bin/python3 $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
-r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing dev requirements..." \
|
||||
&& uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
@@ -453,8 +564,8 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Workaround for triton/pytorch issues
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
# Ensure CUDA compatibility library is loaded
|
||||
RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig
|
||||
|
||||
# ============================================================
|
||||
# SLOW-CHANGING DEPENDENCIES BELOW
|
||||
@@ -474,7 +585,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
ARG FLASHINFER_VERSION=0.5.3
|
||||
# From versions.json: .flashinfer.version
|
||||
ARG FLASHINFER_VERSION=0.6.1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
@@ -503,14 +615,20 @@ RUN set -eux; \
|
||||
|
||||
# Install vllm-openai dependencies (saves ~2.6s per build)
|
||||
# These are stable packages that don't depend on vLLM itself
|
||||
# From versions.json: .bitsandbytes.x86_64, .bitsandbytes.arm64
|
||||
# From versions.json: .openai_server_extras.timm, .openai_server_extras.runai_model_streamer
|
||||
ARG BITSANDBYTES_VERSION_X86=0.46.1
|
||||
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
|
||||
ARG TIMM_VERSION=">=1.0.17"
|
||||
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="0.42.0"; \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope \
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
|
||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||
|
||||
# ============================================================
|
||||
# VLLM INSTALLATION (depends on build stage)
|
||||
@@ -521,11 +639,26 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install vLLM wheel first, so that torch etc will be installed.
|
||||
# Check whether to install torch nightly instead of release for this build.
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing torch nightly..." \
|
||||
&& uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
--index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& echo "Installing vLLM..." \
|
||||
&& uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing vLLM..." \
|
||||
&& uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
@@ -585,12 +718,33 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
ARG PYTORCH_NIGHTLY
|
||||
|
||||
# Install development dependencies (for testing)
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.in requirements/test.in
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
||||
uv pip install --system -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
|
||||
echo "Installing dev requirements plus torch nightly..." \
|
||||
&& python3 use_existing_torch.py --prefix \
|
||||
&& cat torch_lib_versions.txt >> requirements/test.in \
|
||||
&& uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \
|
||||
-r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
else \
|
||||
echo "Installing dev requirements..." \
|
||||
&& uv pip install --system -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi \
|
||||
fi
|
||||
|
||||
# install development dependencies (for testing)
|
||||
|
||||
@@ -15,9 +15,11 @@
|
||||
# Build arguments:
|
||||
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
|
||||
# VLLM_CPU_DISABLE_AVX512=false (default)|true
|
||||
# VLLM_CPU_AVX512BF16=false (default)|true
|
||||
# VLLM_CPU_AVX512VNNI=false (default)|true
|
||||
# VLLM_CPU_AMXBF16=false |true (default)
|
||||
# VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
|
||||
# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
|
||||
#
|
||||
|
||||
######################### COMMON BASE IMAGE #########################
|
||||
@@ -54,9 +56,12 @@ ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE="copy"
|
||||
|
||||
# Copy requirements files for installation
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cpu.txt requirements/cpu.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
||||
uv pip install --upgrade pip && \
|
||||
uv pip install -r requirements/cpu.txt
|
||||
|
||||
@@ -88,6 +93,12 @@ ARG GIT_REPO_CHECK=0
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512=0
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
|
||||
ARG VLLM_CPU_AVX2=0
|
||||
ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
|
||||
# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
|
||||
ARG VLLM_CPU_AVX512=0
|
||||
ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
|
||||
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
|
||||
ARG VLLM_CPU_AVX512BF16=0
|
||||
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
|
||||
@@ -100,18 +111,19 @@ ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Copy build requirements
|
||||
COPY requirements/cpu-build.txt requirements/build.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
|
||||
uv pip install -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
######################### TEST DEPS #########################
|
||||
@@ -119,9 +131,11 @@ FROM base AS vllm-test-deps
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# Copy test requirements
|
||||
COPY requirements/test.in requirements/cpu-test.in
|
||||
|
||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
|
||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
cp requirements/test.in requirements/cpu-test.in && \
|
||||
RUN \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
remove_packages_not_supported_on_aarch64() { \
|
||||
case "$(uname -m)" in \
|
||||
@@ -132,7 +146,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
esac; \
|
||||
}; \
|
||||
remove_packages_not_supported_on_aarch64 && \
|
||||
sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
|
||||
sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||
@@ -200,4 +214,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
|
||||
uv pip install dist/*.whl
|
||||
|
||||
# Add labels to document build configuration
|
||||
LABEL org.opencontainers.image.title="vLLM CPU"
|
||||
LABEL org.opencontainers.image.description="vLLM inference engine for CPU platforms"
|
||||
LABEL org.opencontainers.image.vendor="vLLM Project"
|
||||
LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
|
||||
|
||||
# Build configuration labels
|
||||
ARG TARGETARCH
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ARG VLLM_CPU_AVX2
|
||||
ARG VLLM_CPU_AVX512
|
||||
ARG VLLM_CPU_AVX512BF16
|
||||
ARG VLLM_CPU_AVX512VNNI
|
||||
ARG VLLM_CPU_AMXBF16
|
||||
ARG PYTHON_VERSION
|
||||
|
||||
LABEL ai.vllm.build.target-arch="${TARGETARCH}"
|
||||
LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
|
||||
LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
|
||||
LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
|
||||
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
#######
|
||||
#
|
||||
# THIS FILE IS DEPRECATED AND WILL BE REMOVED SHORTLY
|
||||
#
|
||||
# Please use the standard Dockerfile with PYTORCH_NIGHTLY=1 instead
|
||||
#
|
||||
#######
|
||||
|
||||
# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing
|
||||
|
||||
# for torch nightly, cuda >=12.6 is required,
|
||||
@@ -213,15 +221,14 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source around 10 mins
|
||||
# release version: v0.5.2
|
||||
# release version: v0.6.1
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git checkout v0.5.2 \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
&& rm -rf build \
|
||||
|
||||
@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
|
||||
ARG COMMON_WORKDIR=/app
|
||||
ARG BASE_IMAGE=rocm/vllm-dev:base
|
||||
|
||||
# Sccache configuration (only used in release pipeline)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ARG ARG_PYTORCH_ROCM_ARCH
|
||||
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
|
||||
apt-transport-https ca-certificates wget curl
|
||||
# Remove sccache
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
|
||||
ARG USE_SCCACHE
|
||||
RUN if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
apt-get purge -y sccache || true; \
|
||||
python3 -m pip uninstall -y sccache || true; \
|
||||
rm -f "$(which sccache)" || true; \
|
||||
fi
|
||||
|
||||
# Install UV
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
|
||||
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install sccache if USE_SCCACHE is enabled (for release builds)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME
|
||||
ARG SCCACHE_REGION_NAME
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
if command -v sccache >/dev/null 2>&1; then \
|
||||
echo "sccache already installed, skipping installation"; \
|
||||
sccache --version; \
|
||||
else \
|
||||
echo "Installing sccache..." \
|
||||
&& SCCACHE_ARCH="x86_64" \
|
||||
&& SCCACHE_VERSION="v0.8.1" \
|
||||
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
|
||||
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
|
||||
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& chmod +x /usr/bin/sccache \
|
||||
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& sccache --version; \
|
||||
fi; \
|
||||
fi
|
||||
|
||||
# Set sccache environment variables only when USE_SCCACHE=1
|
||||
# This prevents S3 config from leaking into images when sccache is not used
|
||||
ARG USE_SCCACHE
|
||||
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
|
||||
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
|
||||
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
|
||||
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
WORKDIR ${COMMON_WORKDIR}
|
||||
|
||||
@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/
|
||||
FROM base AS fetch_vllm_1
|
||||
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||
ARG VLLM_BRANCH="main"
|
||||
ENV VLLM_REPO=${VLLM_REPO}
|
||||
ENV VLLM_BRANCH=${VLLM_BRANCH}
|
||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
# -----------------------
|
||||
# vLLM build stages
|
||||
FROM fetch_vllm AS build_vllm
|
||||
# Build vLLM
|
||||
# Build vLLM (setup.py auto-detects sccache in PATH)
|
||||
RUN cd vllm \
|
||||
&& python3 -m pip install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all \
|
||||
@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
||||
|
||||
# RIXL/UCX build stages
|
||||
FROM base AS build_rixl
|
||||
ARG RIXL_BRANCH="f33a5599"
|
||||
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
|
||||
ARG UCX_BRANCH="da3fac2a"
|
||||
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV UCX_HOME=/usr/local/ucx
|
||||
ENV RIXL_HOME=/usr/local/rixl
|
||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||
|
||||
# RIXL build system dependencies and RDMA support
|
||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||
libgrpc-dev \
|
||||
libgrpc++-dev \
|
||||
libprotobuf-dev \
|
||||
protobuf-compiler-grpc \
|
||||
libcpprest-dev \
|
||||
libaio-dev \
|
||||
librdmacm1 \
|
||||
librdmacm-dev \
|
||||
libibverbs1 \
|
||||
libibverbs-dev \
|
||||
ibverbs-utils \
|
||||
rdmacm-utils \
|
||||
ibverbs-providers \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN uv pip install --system meson auditwheel patchelf tomlkit
|
||||
|
||||
RUN cd /usr/local/src && \
|
||||
git clone ${UCX_REPO} && \
|
||||
cd ucx && \
|
||||
git checkout ${UCX_BRANCH} && \
|
||||
./autogen.sh && \
|
||||
mkdir build && cd build && \
|
||||
../configure \
|
||||
--prefix=/usr/local/ucx \
|
||||
--enable-shared \
|
||||
--disable-static \
|
||||
--disable-doxygen-doc \
|
||||
--enable-optimizations \
|
||||
--enable-devel-headers \
|
||||
--with-rocm=/opt/rocm \
|
||||
--with-verbs \
|
||||
--with-dm \
|
||||
--enable-mt && \
|
||||
make -j && \
|
||||
make install
|
||||
|
||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||
cd /opt/rixl && \
|
||||
git checkout ${RIXL_BRANCH} && \
|
||||
meson setup build --prefix=${RIXL_HOME} \
|
||||
-Ducx_path=${UCX_HOME} \
|
||||
-Drocm_path=${ROCM_PATH} && \
|
||||
cd build && \
|
||||
ninja && \
|
||||
ninja install
|
||||
|
||||
# Generate RIXL wheel
|
||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
./contrib/build-wheel.sh \
|
||||
--output-dir /app/install \
|
||||
--rocm-dir ${ROCM_PATH} \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
|
||||
# -----------------------
|
||||
# vLLM wheel release build stage (for building distributable wheels)
|
||||
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
|
||||
FROM fetch_vllm AS build_vllm_wheel_release
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
# Create /install directory for custom wheels
|
||||
RUN mkdir -p /install
|
||||
|
||||
# Copy custom ROCm wheels from docker/context if they exist
|
||||
# COPY ensures Docker cache is invalidated when wheels change
|
||||
# .keep file ensures directory always exists for COPY to work
|
||||
COPY docker/context/base-wheels/ /tmp/base-wheels/
|
||||
# This is how we know whether we are building for a wheel release or not:
# if no wheels are found there, we are not building for a wheel release,
# so we exit with an error to skip this stage.
|
||||
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
|
||||
echo "Found custom wheels - copying to /install"; \
|
||||
cp /tmp/base-wheels/*.whl /install/ && \
|
||||
echo "Copied custom wheels:"; \
|
||||
ls -lh /install/; \
|
||||
else \
|
||||
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
|
||||
echo "Wheel releases require pre-built ROCm wheels."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
|
||||
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
|
||||
echo "Running repository checks..."; \
|
||||
cd vllm && bash tools/check_repo.sh; \
|
||||
fi
|
||||
|
||||
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
|
||||
# This ensures setuptools_scm sees clean repo state for version detection
|
||||
RUN --mount=type=bind,source=.git,target=vllm/.git \
|
||||
cd vllm \
|
||||
&& pip install setuptools_scm regex \
|
||||
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
|
||||
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
|
||||
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
|
||||
|
||||
# Fail if git-based package dependencies are found in requirements files
|
||||
# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
|
||||
# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
|
||||
RUN echo "Checking for git-based packages in requirements files..." \
|
||||
&& echo "Checking common.txt for git-based packages:" \
|
||||
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
|
||||
echo "ERROR: Git-based packages found in common.txt:"; \
|
||||
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
|
||||
echo "Please publish these packages to PyPI instead of using git dependencies."; \
|
||||
exit 1; \
|
||||
else \
|
||||
echo " ✓ No git-based packages found in common.txt"; \
|
||||
fi \
|
||||
&& echo "Checking rocm.txt for git-based packages:" \
|
||||
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
|
||||
echo "ERROR: Git-based packages found in rocm.txt:"; \
|
||||
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
|
||||
echo "Please publish these packages to PyPI instead of using git dependencies."; \
|
||||
exit 1; \
|
||||
else \
|
||||
echo " ✓ No git-based packages found in rocm.txt"; \
|
||||
fi \
|
||||
&& echo "All requirements files are clean - no git-based packages found"
|
||||
|
||||
# Pin vLLM dependencies to exact versions of custom ROCm wheels
|
||||
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
|
||||
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
|
||||
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
|
||||
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
|
||||
|
||||
# Install dependencies using custom wheels from /install
|
||||
RUN cd vllm \
|
||||
&& echo "Building vLLM with custom wheels from /install" \
|
||||
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
|
||||
&& python3 setup.py clean --all
|
||||
|
||||
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
|
||||
# (setup.py auto-detects sccache in PATH)
|
||||
RUN --mount=type=bind,source=.git,target=vllm/.git \
|
||||
cd vllm \
|
||||
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
|
||||
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||
|
||||
FROM scratch AS export_vllm_wheel_release
|
||||
ARG COMMON_WORKDIR
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
|
||||
|
||||
# -----------------------
|
||||
# Test vLLM image
|
||||
FROM base AS test
|
||||
@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||
&& pip uninstall -y vllm \
|
||||
&& uv pip install --system *.whl
|
||||
|
||||
# Install RIXL wheel
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
|
||||
uv pip install --system /rixl_install/*.whl
|
||||
|
||||
WORKDIR /vllm-workspace
|
||||
ARG COMMON_WORKDIR
|
||||
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||
@@ -118,6 +342,19 @@ RUN mkdir src && mv vllm src/vllm
|
||||
FROM base AS final
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Clean up sccache from release image (not needed at runtime)
|
||||
# This removes the binary and wrappers that may have been installed during build
|
||||
RUN rm -f /usr/bin/sccache || true \
|
||||
&& rm -rf /opt/sccache-wrappers || true
|
||||
|
||||
# Unset sccache environment variables for the release image
|
||||
# This prevents S3 bucket config from leaking into production images
|
||||
ENV SCCACHE_BUCKET=
|
||||
ENV SCCACHE_REGION=
|
||||
ENV SCCACHE_S3_NO_CREDENTIALS=
|
||||
ENV SCCACHE_IDLE_TIMEOUT=
|
||||
|
||||
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
||||
# Manually remove it so that later steps of numpy upgrade can continue
|
||||
RUN case "$(which python3)" in \
|
||||
@@ -159,3 +396,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
|
||||
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
#Set entrypoint for vllm-openai official images
|
||||
FROM final AS vllm-openai
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
ARG MORI_BRANCH="2d02c6a9"
|
||||
ARG MORI_REPO="https://github.com/ROCm/mori.git"
|
||||
|
||||
#TODO: When patch has been upstreamed, switch to the main repo/branch
|
||||
# ARG RIXL_BRANCH="<TODO>"
|
||||
# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
|
||||
ARG RIXL_BRANCH="50d63d94"
|
||||
ARG RIXL_REPO="https://github.com/vcave/RIXL.git"
|
||||
# Needed by RIXL
|
||||
ARG ETCD_BRANCH="7c6e714f"
|
||||
ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git"
|
||||
ARG UCX_BRANCH="da3fac2a"
|
||||
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
|
||||
# Sccache configuration (only used in release pipeline)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@@ -64,6 +61,49 @@ RUN apt-get update -y \
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
||||
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install sccache if USE_SCCACHE is enabled (for release builds)
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_DOWNLOAD_URL
|
||||
ARG SCCACHE_ENDPOINT
|
||||
ARG SCCACHE_BUCKET_NAME
|
||||
ARG SCCACHE_REGION_NAME
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& SCCACHE_ARCH="x86_64" \
|
||||
&& SCCACHE_VERSION="v0.8.1" \
|
||||
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
|
||||
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
|
||||
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
|
||||
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& chmod +x /usr/bin/sccache \
|
||||
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
|
||||
&& sccache --version; \
|
||||
fi
|
||||
|
||||
# Setup sccache for HIP compilation via HIP_CLANG_PATH
|
||||
# This creates wrapper scripts in a separate directory and points HIP to use them
|
||||
# This avoids modifying the original ROCm binaries which can break detection
|
||||
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
|
||||
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
|
||||
RUN if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Setting up sccache wrappers for HIP compilation..." \
|
||||
&& mkdir -p /opt/sccache-wrappers \
|
||||
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
|
||||
&& chmod +x /opt/sccache-wrappers/clang++ \
|
||||
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
|
||||
&& chmod +x /opt/sccache-wrappers/clang \
|
||||
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
|
||||
fi
|
||||
|
||||
# Set sccache environment variables only when USE_SCCACHE=1
|
||||
# This prevents S3 config from leaking into images when sccache is not used
|
||||
ARG USE_SCCACHE
|
||||
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
|
||||
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
|
||||
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
|
||||
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
|
||||
|
||||
|
||||
###
|
||||
### Triton Build
|
||||
@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH
|
||||
ARG PYTORCH_REPO
|
||||
ARG PYTORCH_VISION_REPO
|
||||
ARG PYTORCH_AUDIO_REPO
|
||||
ARG USE_SCCACHE
|
||||
|
||||
RUN git clone ${PYTORCH_REPO} pytorch
|
||||
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
|
||||
&& pip install -r requirements.txt && git submodule update --init --recursive \
|
||||
&& python3 tools/amd_build/build_amd.py \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN git clone ${PYTORCH_VISION_REPO} vision
|
||||
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
|
||||
fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN git clone ${PYTORCH_AUDIO_REPO} audio
|
||||
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
|
||||
fi \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& pip install dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install \
|
||||
@@ -138,105 +198,25 @@ RUN cd mori \
|
||||
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
|
||||
|
||||
|
||||
###
|
||||
### RIXL Build
|
||||
###
|
||||
FROM build_pytorch AS build_rixl
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV UCX_HOME=/usr/local/ucx
|
||||
ENV RIXL_HOME=/usr/local/rixl
|
||||
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
|
||||
|
||||
# RIXL build system dependencies and RDMA support
|
||||
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
|
||||
libgrpc-dev \
|
||||
libgrpc++-dev \
|
||||
libprotobuf-dev \
|
||||
protobuf-compiler-grpc \
|
||||
libcpprest-dev \
|
||||
libaio-dev \
|
||||
librdmacm1 \
|
||||
librdmacm-dev \
|
||||
libibverbs1 \
|
||||
libibverbs-dev \
|
||||
ibverbs-utils \
|
||||
rdmacm-utils \
|
||||
ibverbs-providers
|
||||
|
||||
RUN pip install meson auditwheel patchelf tomlkit
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone ${ETCD_REPO} && \
|
||||
cd etcd-cpp-apiv3 && \
|
||||
git checkout ${ETCD_BRANCH} && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_POLICY_VERSION_MINIMUM=3.5 && \
|
||||
make -j$(nproc) && \
|
||||
make install
|
||||
|
||||
RUN cd /usr/local/src && \
|
||||
git clone ${UCX_REPO} && \
|
||||
cd ucx && \
|
||||
git checkout ${UCX_BRANCH} && \
|
||||
./autogen.sh && \
|
||||
mkdir build && cd build && \
|
||||
../configure \
|
||||
--prefix=/usr/local/ucx \
|
||||
--enable-shared \
|
||||
--disable-static \
|
||||
--disable-doxygen-doc \
|
||||
--enable-optimizations \
|
||||
--enable-devel-headers \
|
||||
--with-rocm=/opt/rocm \
|
||||
--with-verbs \
|
||||
--with-dm \
|
||||
--enable-mt && \
|
||||
make -j && \
|
||||
make -j install
|
||||
|
||||
ENV PATH=/usr/local/ucx/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
RUN git clone ${RIXL_REPO} /opt/rixl && \
|
||||
cd /opt/rixl && \
|
||||
git checkout ${RIXL_BRANCH} && \
|
||||
meson setup build --prefix=${RIXL_HOME} \
|
||||
-Ducx_path=${UCX_HOME} \
|
||||
-Drocm_path=${ROCM_PATH} && \
|
||||
cd build && \
|
||||
ninja && \
|
||||
ninja install
|
||||
|
||||
# Generate RIXL wheel
|
||||
RUN cd /opt/rixl && mkdir -p /app/install && \
|
||||
./contrib/build-wheel.sh \
|
||||
--output-dir /app/install \
|
||||
--rocm-dir ${ROCM_PATH} \
|
||||
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
|
||||
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
|
||||
|
||||
|
||||
###
|
||||
### FlashAttention Build
|
||||
###
|
||||
FROM base AS build_fa
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG USE_SCCACHE
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone ${FA_REPO}
|
||||
RUN cd flash-attention \
|
||||
&& git checkout ${FA_BRANCH} \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
|
||||
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
||||
|
||||
|
||||
@@ -246,6 +226,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
|
||||
FROM base AS build_aiter
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
ARG USE_SCCACHE
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
@@ -253,13 +234,37 @@ RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN pip install pyyaml && cd aiter \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
export HIP_CLANG_PATH=/opt/sccache-wrappers \
|
||||
&& sccache --show-stats; \
|
||||
fi \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
|
||||
&& ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
|
||||
###
|
||||
### Final Build
|
||||
###
|
||||
|
||||
# Wheel release stage -
|
||||
# only includes dependencies used by wheel release pipeline
|
||||
FROM base AS debs_wheel_release
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
# Full debs stage - includes Mori (used by Docker releases)
|
||||
FROM base AS debs
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
@@ -274,8 +279,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_rixl,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
|
||||
@@ -294,12 +297,6 @@ ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
ARG RIXL_BRANCH
|
||||
ARG RIXL_REPO
|
||||
ARG ETCD_BRANCH
|
||||
ARG ETCD_REPO
|
||||
ARG UCX_BRANCH
|
||||
ARG UCX_REPO
|
||||
ARG MORI_BRANCH
|
||||
ARG MORI_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
@@ -315,11 +312,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
|
||||
&& echo "RIXL_BRANCH: ${RIXL_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "RIXL_REPO: ${RIXL_REPO}" >> /app/versions.txt \
|
||||
&& echo "ETCD_BRANCH: ${ETCD_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "ETCD_REPO: ${ETCD_REPO}" >> /app/versions.txt \
|
||||
&& echo "UCX_BRANCH: ${UCX_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "UCX_REPO: ${UCX_REPO}" >> /app/versions.txt \
|
||||
&& echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
|
||||
|
||||
92
docker/versions.json
Normal file
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"_comment": "Auto-generated from Dockerfile ARGs. Do not edit manually. Run: python tools/generate_versions_json.py",
|
||||
"variable": {
|
||||
"CUDA_VERSION": {
|
||||
"default": "12.9.1"
|
||||
},
|
||||
"PYTHON_VERSION": {
|
||||
"default": "3.12"
|
||||
},
|
||||
"BUILD_BASE_IMAGE": {
|
||||
"default": "nvidia/cuda:12.9.1-devel-ubuntu20.04"
|
||||
},
|
||||
"FINAL_BASE_IMAGE": {
|
||||
"default": "nvidia/cuda:12.9.1-base-ubuntu22.04"
|
||||
},
|
||||
"GET_PIP_URL": {
|
||||
"default": "https://bootstrap.pypa.io/get-pip.py"
|
||||
},
|
||||
"PYTORCH_CUDA_INDEX_BASE_URL": {
|
||||
"default": "https://download.pytorch.org/whl"
|
||||
},
|
||||
"PIP_KEYRING_PROVIDER": {
|
||||
"default": "disabled"
|
||||
},
|
||||
"UV_KEYRING_PROVIDER": {
|
||||
"default": "disabled"
|
||||
},
|
||||
"INSTALL_KV_CONNECTORS": {
|
||||
"default": "false"
|
||||
},
|
||||
"TORCH_CUDA_ARCH_LIST": {
|
||||
"default": "7.0 7.5 8.0 8.9 9.0 10.0 12.0"
|
||||
},
|
||||
"MAX_JOBS": {
|
||||
"default": "2"
|
||||
},
|
||||
"NVCC_THREADS": {
|
||||
"default": "8"
|
||||
},
|
||||
"SCCACHE_BUCKET_NAME": {
|
||||
"default": "vllm-build-sccache"
|
||||
},
|
||||
"SCCACHE_REGION_NAME": {
|
||||
"default": "us-west-2"
|
||||
},
|
||||
"SCCACHE_S3_NO_CREDENTIALS": {
|
||||
"default": "0"
|
||||
},
|
||||
"vllm_target_device": {
|
||||
"default": "cuda"
|
||||
},
|
||||
"DEEPGEMM_GIT_REF": {
|
||||
"default": "594953acce41793ae00a1233eb516044d604bcb6"
|
||||
},
|
||||
"PPLX_COMMIT_HASH": {
|
||||
"default": "12cecfd"
|
||||
},
|
||||
"DEEPEP_COMMIT_HASH": {
|
||||
"default": "73b6ea4"
|
||||
},
|
||||
"GIT_REPO_CHECK": {
|
||||
"default": "0"
|
||||
},
|
||||
"VLLM_MAX_SIZE_MB": {
|
||||
"default": "500"
|
||||
},
|
||||
"RUN_WHEEL_CHECK": {
|
||||
"default": "true"
|
||||
},
|
||||
"FLASHINFER_VERSION": {
|
||||
"default": "0.6.1"
|
||||
},
|
||||
"GDRCOPY_CUDA_VERSION": {
|
||||
"default": "12.8"
|
||||
},
|
||||
"GDRCOPY_OS_VERSION": {
|
||||
"default": "Ubuntu22_04"
|
||||
},
|
||||
"BITSANDBYTES_VERSION_X86": {
|
||||
"default": "0.46.1"
|
||||
},
|
||||
"BITSANDBYTES_VERSION_ARM64": {
|
||||
"default": "0.42.0"
|
||||
},
|
||||
"TIMM_VERSION": {
|
||||
"default": ">=1.0.17"
|
||||
},
|
||||
"RUNAI_MODEL_STREAMER_VERSION": {
|
||||
"default": ">=0.15.3"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -82,10 +82,6 @@ Internal data structures.
|
||||
|
||||
- [vllm.multimodal.processing][]
|
||||
|
||||
### Memory Profiling
|
||||
|
||||
- [vllm.multimodal.profiling][]
|
||||
|
||||
### Registry
|
||||
|
||||
- [vllm.multimodal.registry][]
|
||||
|
||||
Binary file not shown.
|
Image changed (size: 205 KiB before, 325 KiB after).
@@ -13,14 +13,14 @@ For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 C
|
||||
Here is an example `docker run` command for CPU. For GPUs, skip setting the `ON_CPU` env var.
|
||||
|
||||
```bash
|
||||
export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
|
||||
export VLLM_COMMIT=7f42dc20bb2800d09faa72b26f25d54e26f1b694 # use full commit hash from the main branch
|
||||
export HF_TOKEN=<valid Hugging Face token>
|
||||
if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
|
||||
IMG_SUFFIX="arm64-cpu"
|
||||
else
|
||||
IMG_SUFFIX="cpu"
|
||||
fi
|
||||
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
|
||||
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
|
||||
```
|
||||
|
||||
Then, run the command below inside the docker container.
|
||||
|
||||
@@ -139,6 +139,63 @@ The algorithm for adjusting the SLA variable is as follows:
|
||||
|
||||
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
|
||||
|
||||
### Startup
|
||||
|
||||
`vllm bench sweep startup` runs `vllm bench startup` across parameter combinations to compare cold/warm startup time for different engine settings.
|
||||
|
||||
Follow these steps to run the script:
|
||||
|
||||
1. (Optional) Construct the base command to `vllm bench startup`, and pass it to `--startup-cmd` (default: `vllm bench startup`).
|
||||
2. (Optional) Reuse a `--serve-params` JSON from `vllm bench sweep serve` to vary engine settings. Only parameters supported by `vllm bench startup` are applied.
|
||||
3. (Optional) Create a `--startup-params` JSON to vary startup-specific options like iteration counts.
|
||||
4. Determine where you want to save the results, and pass that to `--output-dir`.
|
||||
|
||||
Example `--serve-params`:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"_benchmark_name": "tp1",
|
||||
"model": "Qwen/Qwen3-0.6B",
|
||||
"tensor_parallel_size": 1,
|
||||
"gpu_memory_utilization": 0.9
|
||||
},
|
||||
{
|
||||
"_benchmark_name": "tp2",
|
||||
"model": "Qwen/Qwen3-0.6B",
|
||||
"tensor_parallel_size": 2,
|
||||
"gpu_memory_utilization": 0.9
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Example `--startup-params`:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"_benchmark_name": "qwen3-0.6",
|
||||
"num_iters_cold": 2,
|
||||
"num_iters_warmup": 1,
|
||||
"num_iters_warm": 2
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
vllm bench sweep startup \
|
||||
--startup-cmd 'vllm bench startup --model Qwen/Qwen3-0.6B' \
|
||||
--serve-params benchmarks/serve_hparams.json \
|
||||
--startup-params benchmarks/startup_hparams.json \
|
||||
-o benchmarks/results
|
||||
```
|
||||
|
||||
!!! important
|
||||
By default, unsupported parameters in `--serve-params` or `--startup-params` are ignored with a warning.
|
||||
Use `--strict-params` to fail fast on unknown keys.
|
||||
|
||||
## Visualization
|
||||
|
||||
### Basic
|
||||
|
||||
@@ -43,10 +43,16 @@ If you are only developing vLLM's Python code, install vLLM using:
|
||||
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||
```
|
||||
|
||||
If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
|
||||
If you are developing vLLM's Python and CUDA/C++ code, install PyTorch first:
|
||||
|
||||
```bash
|
||||
uv pip install -e .
|
||||
uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129
|
||||
```
|
||||
|
||||
then install vLLM using:
|
||||
|
||||
```bash
|
||||
uv pip install -e . --no-build-isolation
|
||||
```
|
||||
|
||||
For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
|
||||
|
||||
@@ -23,29 +23,32 @@ Further update the model as follows:
|
||||
raise ValueError("Only image modality is supported")
|
||||
```
|
||||
|
||||
- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
|
||||
- Inside the `__init__` method, initialize the language components of the model inside [_mark_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal._mark_language_model], and the multimodal components of the model inside [_mark_tower_model][vllm.model_executor.models.interfaces.SupportsMultiModal._mark_tower_model], e.g.:
|
||||
|
||||
```diff
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
+ pixel_values: torch.Tensor,
|
||||
) -> SamplerOutput:
|
||||
```
|
||||
|
||||
More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it.
|
||||
```python
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
||||
super().__init__()
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
|
||||
with self._mark_tower_model(vllm_config, "image"):
|
||||
self.vision_encoder = ...
|
||||
self.multi_modal_projector = ...
|
||||
|
||||
with self._mark_language_model(vllm_config):
|
||||
self.language_model = init_vllm_registered_model(
|
||||
vllm_config=vllm_config,
|
||||
hf_config=config.text_config,
|
||||
prefix=maybe_prefix(prefix, "language_model"),
|
||||
)
|
||||
```
|
||||
|
||||
- Implement [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal] that returns the embeddings from running the multimodal inputs through the multimodal encoder of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
@@ -71,18 +74,7 @@ Further update the model as follows:
|
||||
[PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
|
||||
This logic can be found at [embed_input_ids][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids].
|
||||
|
||||
You may override this method if additional logic is required for your model when merging embeddings.
|
||||
|
||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_language_model(self) -> torch.nn.Module:
|
||||
# Change `language_model` according to your implementation.
|
||||
return self.language_model
|
||||
```
|
||||
You may override this method if additional logic is required for your model when merging embeddings.
|
||||
|
||||
- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
|
||||
|
||||
@@ -116,12 +108,10 @@ def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
|
||||
## 3. Specify dummy inputs
|
||||
|
||||
Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for
|
||||
HF processing as well as memory profiling.
|
||||
Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.processing.BaseDummyInputsBuilder] to construct dummy inputs for
|
||||
HF processing. The processed outputs are also used for memory profiling.
|
||||
|
||||
### For memory profiling
|
||||
|
||||
Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
|
||||
Override the abstract methods [get_dummy_text][vllm.multimodal.processing.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.processing.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
|
||||
|
||||
Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which equals the number of placeholder feature tokens.
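For illustration, a dummy inputs builder for an image-only model could look roughly like the sketch below. The import path and exact method signatures vary between vLLM versions, and `YourModelProcessingInfo`, the `<image>` placeholder, and the 1024x1024 target size are assumptions made for this sketch only.

```python
from collections.abc import Mapping

from PIL import Image

# Depending on your vLLM version, this may live in vllm.multimodal.profiling instead.
from vllm.multimodal.processing import BaseDummyInputsBuilder


class YourModelDummyInputsBuilder(BaseDummyInputsBuilder):
    # In practice this class is parameterized by your BaseProcessingInfo subclass,
    # e.g. BaseDummyInputsBuilder[YourModelProcessingInfo].

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One placeholder per image so the HF processor expands the prompt
        # into the corresponding number of feature tokens.
        num_images = mm_counts.get("image", 0)
        return "<image>" * num_images

    def get_dummy_mm_data(self, seq_len: int, mm_counts: Mapping[str, int]):
        # Use the largest resolution the model supports so that profiling
        # reserves memory for the worst case.
        num_images = mm_counts.get("image", 0)
        width = height = 1024  # assumed maximum for this sketch
        return {
            "image": [
                Image.new("RGB", (width, height), color=0)
                for _ in range(num_images)
            ]
        }
```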
|
||||
|
||||
@@ -803,7 +793,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
## 5. Register processor-related classes
|
||||
|
||||
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
|
||||
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
|
||||
[BaseDummyInputsBuilder][vllm.multimodal.processing.BaseDummyInputsBuilder] (Step 3),
|
||||
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
|
||||
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor]
|
||||
to register them to the multi-modal registry:
|
||||
|
||||
@@ -8,15 +8,6 @@ This document will introduce how CustomOp works in vLLM and how to implement a n
|
||||
|
||||
`CustomOp` manages two dictionaries of all custom ops (i.e., op classes, indexed by registered name) in its class, for vLLM and OOT plugins respectively.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
class CustomOp(nn.Module):
|
||||
|
||||
op_registry: dict[str, type["CustomOp"]] = {}
|
||||
op_registry_oot: dict[str, type["CustomOp"]] = {}
|
||||
```
|
||||
|
||||
We can use `@CustomOp.register("op_name")` to register an op class to the `CustomOp` system. After this, the `op_name` and its class will be added into the `op_registry` dictionary. In addition, we can also register an OOT op by `@CustomOp.register_oot("op_name")`. We will introduce this mechanism in detail later.
|
||||
|
||||
When a `CustomOp` is called (i.e., when its `forward()` method is invoked), if it is enabled (i.e., with `--compilation_config.custom_ops '["+op_name"]'`), it will automatically dispatch the forward method to the appropriate backend according to `current_platform`. Otherwise (i.e., it is disabled), it will only call the `forward_native()` method to use the PyTorch-native implementation of this forward method.
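As a rough sketch of how registration and dispatch fit together (the import path and the exact set of `forward_*` hooks may differ between vLLM versions; the op name and math here are purely illustrative):

```python
import torch

# Import path may differ depending on your vLLM version.
from vllm.model_executor.custom_op import CustomOp


@CustomOp.register("relu_and_scale")  # added to CustomOp.op_registry under this name
class ReluAndScale(CustomOp):
    def __init__(self, scale: float = 2.0) -> None:
        super().__init__()
        self.scale = scale

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        # Portable PyTorch implementation, used when the op is disabled.
        return torch.relu(x) * self.scale

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        # Backend-specific override dispatched automatically when the op is
        # enabled, e.g. via --compilation_config.custom_ops '["+relu_and_scale"]'.
        return torch.relu(x) * self.scale
```

An out-of-tree plugin would use `@CustomOp.register_oot("relu_and_scale")` instead, which places the class in `op_registry_oot`.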
|
||||
|
||||
@@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen
|
||||
The `validate_or_generate_params` method lets the plugin validate any `SamplingParameters`/`PoolingParameters` received with the user request, or generate new ones if none are specified. The function always returns the validated/generated parameters.
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available at [vllm/entrypoints/pooling/pooling/serving.py](../../vllm/entrypoints/pooling/pooling/serving.py).
|
||||
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_online.py](../../examples/pooling/plugin/prithvi_geospatial_mae_online.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
||||
|
||||
## Using an IO Processor plugin
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
|
||||
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
|
||||
- `vllm:prompt_tokens` - Prompt tokens.
|
||||
- `vllm:generation_tokens` - Generation tokens.
|
||||
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
|
||||
- `vllm:inter_token_latency_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
|
||||
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
|
||||
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
|
||||
- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
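To quickly inspect these counters on a running server outside of Grafana, you can scrape the Prometheus endpoint directly; a minimal sketch, assuming the default OpenAI-compatible server on port 8000:

```python
import requests

# Fetch the Prometheus exposition text from a running vLLM server and
# print only the vllm:* metric samples (comment lines start with '#').
resp = requests.get("http://localhost:8000/metrics", timeout=5)
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)
```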
|
||||
|
||||
@@ -43,7 +43,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h
|
||||
|
||||
### Dummy text
|
||||
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.processing.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
|
||||
|
||||
### Automatic prompt updating
|
||||
|
||||
|
||||
@@ -85,14 +85,13 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
|
||||
|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
|
||||
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
||||
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
|
||||
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||
| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
|
||||
| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
|
||||
| naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
|
||||
|
||||
@@ -22,8 +22,13 @@ In the example above, the KV cache in the first block can be uniquely identified
|
||||
We only cache full blocks.
|
||||
|
||||
!!! note "Note 2"
|
||||
The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we use SHA256** as hash function instead of the builtin hash.
|
||||
SHA256 is supported since vLLM v0.8.3 and the default since v0.10.2. It comes with a negligible performance impact of about 75ns per token (<4ms for 50k tokens of context).
|
||||
In previous versions, the hash key was not guaranteed to be collision-free. As of v0.11, the default hashing algorithm is `sha256`, which addresses collision risks.
|
||||
|
||||
For `vllm serve`, you can control the hashing algorithm via `--prefix-caching-hash-algo`:
|
||||
- `sha256` (default): Uses Python's `pickle` for serialization. Hashes may not be reproducible across different Python or vLLM versions.
|
||||
- `sha256_cbor`: Uses `cbor2` for serialization, providing a reproducible, cross-language compatible hash. This is recommended for deterministic caching across environments.
|
||||
- `xxhash`: Uses `pickle` serialization with xxHash (128-bit) for faster, non-cryptographic hashing. Requires the optional `xxhash` package. IMPORTANT: using a hashing algorithm that is not considered cryptographically secure theoretically increases the risk of hash collisions, which can cause undefined behavior or even leak private information in multi-tenant environments. Even though collisions remain very unlikely, weigh your security risk tolerance against the performance benefits before turning this on.
|
||||
- `xxhash_cbor`: Combines canonical CBOR serialization with xxHash for reproducible hashing. Requires the optional `xxhash` package.
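The same choice is available in the offline API; a minimal sketch, assuming the engine argument mirrors the CLI flag name:

```python
from vllm import LLM

# Illustrative only: `prefix_caching_hash_algo` mirrors the
# --prefix-caching-hash-algo CLI flag shown above.
llm = LLM(
    model="Qwen/Qwen3-0.6B",
    enable_prefix_caching=True,
    prefix_caching_hash_algo="sha256_cbor",  # reproducible across environments
)
```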
|
||||
|
||||
**A hashing example with multi-modality inputs**
|
||||
In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages:
|
||||
|
||||
@@ -11,14 +11,14 @@ to new models to improve performance.
|
||||
|
||||
## Overview
|
||||
|
||||
We have recently enabled the `@supports_torch_compile` decorator to work for multiple nn module components within a model type; this enables
|
||||
We have recently enabled the `@support_torch_compile` decorator to work for multiple nn module components within a model type; this enables
|
||||
turning compile on for multimodal encoders, bringing performance improvements to additional components of the stack.
|
||||
|
||||
When applied to the vision block of [`Qwen2_5_vl`](https://github.com/vllm-project/vllm/pull/23207), we observe ~4.5% e2e perf improvements with
|
||||
some increase in compilation time.
|
||||
|
||||
This feature is off by default, but can be enabled by setting `compile_mm_encoder: true` in the compilation config when models have the
|
||||
`@supports_torch_compile` decorator.
|
||||
`@support_torch_compile` decorator.
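For example, assuming the model's encoder carries the decorator, the flag can be passed through the compilation config (a sketch; the model choice here is illustrative):

```python
from vllm import LLM

# Enable multimodal encoder compilation via the compilation config.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    compilation_config={"compile_mm_encoder": True},
)
```

With `vllm serve`, the same key can typically be supplied through the `--compilation-config` JSON argument.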
|
||||
|
||||
## How Compilation Works for Multimodal Components
|
||||
|
||||
@@ -26,7 +26,7 @@ This feature is off by default, but can be enabled by setting `compile_mm_encode
|
||||
|
||||
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
|
||||
|
||||
1. The `@supports_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
|
||||
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
|
||||
`compile_mm_encoder` configuration
|
||||
|
||||
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
||||
@@ -44,9 +44,9 @@ this for more configuration in the future.
|
||||
|
||||
## Applying torch.compile to a New Multimodal Model/Component
|
||||
|
||||
To apply `supports_torch_compile` to a new general nn.Module, we advise following the same steps in [`debug_vllm_compile`](./debug_vllm_compile.md); this includes:
|
||||
To apply `support_torch_compile` to a new general nn.Module, we advise following the same steps in [`debug_vllm_compile`](./debug_vllm_compile.md); this includes:
|
||||
|
||||
1. Applying `supports_torch_compile` on initially small modules (such as basic MLP layers), then raising to more general modules until one reaches a good performance
|
||||
1. Applying `support_torch_compile` on initially small modules (such as basic MLP layers), then raising to more general modules until one reaches a good performance
|
||||
tradeoff
|
||||
|
||||
2. Leveraging [`tlparse`](https://github.com/meta-pytorch/tlparse) to identify and eliminate the source of recompiles and graph breaks
|
||||
|
||||
@@ -106,6 +106,7 @@ Batch invariance has been tested and verified on the following models:
|
||||
- **DeepSeek series**: `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-V3-0324`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`
|
||||
- **Qwen3 (Dense)**: `Qwen/Qwen3-1.7B`, `Qwen/Qwen3-8B`
|
||||
- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
|
||||
- **Qwen2.5**: `Qwen/Qwen2.5-0.5B-Instruct`, `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`
|
||||
- **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
|
||||
|
||||
Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose).
|
||||
|
||||
@@ -210,6 +210,24 @@ Alternatively, follow these example steps to implement your own plugin:
|
||||
|
||||
For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
|
||||
|
||||
### In-Place LoRA Reloading
|
||||
|
||||
When dynamically loading LoRA adapters, you may need to replace an existing adapter with updated weights while keeping the same name. The `load_inplace` parameter enables this functionality. This commonly occurs in asynchronous reinforcement learning setups, where adapters are continuously updated and swapped in without interrupting ongoing inference.
|
||||
|
||||
When `load_inplace=True`, vLLM will replace the existing adapter with the new one.
|
||||
|
||||
Example request to load or replace a LoRA adapter with the same name:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/v1/load_lora_adapter \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"lora_name": "my-adapter",
|
||||
"lora_path": "/path/to/adapter/v2",
|
||||
"load_inplace": true
|
||||
}'
|
||||
```
|
||||
|
||||
## New format for `--lora-modules`
|
||||
|
||||
In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
|
||||
|
||||
@@ -20,67 +20,6 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
|
||||
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
|
||||
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].
|
||||
|
||||
### Stable UUIDs for Caching (multi_modal_uuids)
|
||||
|
||||
When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from PIL import Image
|
||||
|
||||
# Qwen2.5-VL example with two images
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||
|
||||
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||
img_a = Image.open("/path/to/a.jpg")
|
||||
img_b = Image.open("/path/to/b.jpg")
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [img_a, img_b]},
|
||||
# Provide stable IDs for caching.
|
||||
# Requirements (matched by this example):
|
||||
# - Include every modality present in multi_modal_data.
|
||||
# - For lists, provide the same number of entries.
|
||||
# - Use None to fall back to content hashing for that item.
|
||||
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
print(o.outputs[0].text)
|
||||
```
|
||||
|
||||
Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from PIL import Image
|
||||
|
||||
# Qwen2.5-VL example with two images
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||
|
||||
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||
img_b = Image.open("/path/to/b.jpg")
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [None, img_b]},
|
||||
# Since img_a is expected to be cached, we can skip sending the actual
|
||||
# image entirely.
|
||||
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
print(o.outputs[0].text)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
|
||||
@@ -397,7 +336,8 @@ No manual conversion is needed - vLLM handles the channel normalization automati
|
||||
### Embedding Inputs
|
||||
|
||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
pass a tensor of shape `(..., hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
The exact shape depends on the model being used.
|
||||
|
||||
You must enable this feature via `enable_mm_embeds=True`.
|
||||
|
||||
@@ -418,8 +358,7 @@ You must enable this feature via `enable_mm_embeds=True`.
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
|
||||
|
||||
# Embeddings for single image
|
||||
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
|
||||
# For most models, `image_embeds` has shape: (num_images, image_feature_size, hidden_size)
|
||||
image_embeds = torch.load(...)
|
||||
|
||||
outputs = llm.generate({
|
||||
@@ -430,21 +369,8 @@ You must enable this feature via `enable_mm_embeds=True`.
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
# Construct the prompt based on your model
|
||||
prompt = ...
|
||||
|
||||
# Embeddings for multiple images
|
||||
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
|
||||
image_embeds = torch.load(...)
|
||||
|
||||
# Qwen2-VL
|
||||
# Additional examples for models that require extra fields
|
||||
llm = LLM(
|
||||
"Qwen/Qwen2-VL-2B-Instruct",
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
@@ -452,13 +378,15 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
||||
)
|
||||
mm_data = {
|
||||
"image": {
|
||||
"image_embeds": image_embeds,
|
||||
# Shape: (total_feature_size, hidden_size)
|
||||
# total_feature_size = sum(image_feature_size for image in images)
|
||||
"image_embeds": torch.load(...),
|
||||
# Shape: (num_images, 3)
|
||||
# image_grid_thw is needed to calculate positional encoding.
|
||||
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
|
||||
"image_grid_thw": torch.load(...),
|
||||
}
|
||||
}
|
||||
|
||||
# MiniCPM-V
|
||||
llm = LLM(
|
||||
"openbmb/MiniCPM-V-2_6",
|
||||
trust_remote_code=True,
|
||||
@@ -467,20 +395,14 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
||||
)
|
||||
mm_data = {
|
||||
"image": {
|
||||
"image_embeds": image_embeds,
|
||||
# Shape: (num_images, num_slices, hidden_size)
|
||||
# num_slices can differ for each image
|
||||
"image_embeds": [torch.load(...) for image in images],
|
||||
# Shape: (num_images, 2)
|
||||
# image_sizes is needed to calculate details of the sliced image.
|
||||
"image_sizes": [image.size for image in images], # list of image sizes
|
||||
"image_sizes": [image.size for image in images],
|
||||
}
|
||||
}
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": mm_data,
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
|
||||
@@ -501,8 +423,8 @@ You can pass pre-computed audio embeddings similar to image embeddings:
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
prompt = "USER: <audio>\nWhat is in this audio?\nASSISTANT:"
|
||||
|
||||
# Load pre-computed audio embeddings
|
||||
# torch.Tensor of shape (1, audio_feature_size, hidden_size of LM)
|
||||
# Load pre-computed audio embeddings, usually with shape:
|
||||
# (num_audios, audio_feature_size, hidden_size of LM)
|
||||
audio_embeds = torch.load(...)
|
||||
|
||||
outputs = llm.generate({
|
||||
@@ -515,6 +437,67 @@ You can pass pre-computed audio embeddings similar to image embeddings:
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
### Cached Inputs
|
||||
|
||||
When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from PIL import Image
|
||||
|
||||
# Qwen2.5-VL example with two images
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||
|
||||
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||
img_a = Image.open("/path/to/a.jpg")
|
||||
img_b = Image.open("/path/to/b.jpg")
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [img_a, img_b]},
|
||||
# Provide stable IDs for caching.
|
||||
# Requirements (matched by this example):
|
||||
# - Include every modality present in multi_modal_data.
|
||||
# - For lists, provide the same number of entries.
|
||||
# - Use None to fall back to content hashing for that item.
|
||||
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
print(o.outputs[0].text)
|
||||
```
|
||||
|
||||
Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from PIL import Image
|
||||
|
||||
# Qwen2.5-VL example with two images
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||
|
||||
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||
img_b = Image.open("/path/to/b.jpg")
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [None, img_b]},
|
||||
# Since img_a is expected to be cached, we can skip sending the actual
|
||||
# image entirely.
|
||||
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
print(o.outputs[0].text)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
|
||||
|
||||
## Online Serving
|
||||
|
||||
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests.
|
||||
@@ -879,7 +862,11 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
|
||||
### Embedding Inputs
|
||||
|
||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||
pass a tensor of shape `(..., hidden_size of LM)` for each item to the corresponding field of the multi-modal dictionary.
|
||||
|
||||
!!! important
|
||||
Unlike offline inference, the embeddings for each item must be passed separately
|
||||
in order for placeholder tokens to be applied correctly by the chat template.
|
||||
|
||||
You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.
|
||||
|
||||
@@ -897,11 +884,6 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
```python
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
image_embedding = torch.load(...)
|
||||
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
|
||||
base64_image_embedding = tensor2base64(image_embedding)
|
||||
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
api_key=openai_api_key,
|
||||
@@ -912,29 +894,33 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
model = "llava-hf/llava-1.5-7b-hf"
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": f"{base64_image_embedding}",
|
||||
"image_embeds": tensor2base64(torch.load(...)), # Shape: (image_feature_size, hidden_size)
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
|
||||
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
||||
|
||||
# Additional examples for models that require extra fields
|
||||
model = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
"image_embeds": tensor2base64(torch.load(...)), # Shape: (image_feature_size, hidden_size)
|
||||
"image_grid_thw": tensor2base64(torch.load(...)), # Shape: (3,)
|
||||
},
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
|
||||
model = "openbmb/MiniCPM-V-2_6"
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
|
||||
"image_embeds": tensor2base64(torch.load(...)), # Shape: (num_slices, hidden_size)
|
||||
"image_sizes": tensor2base64(torch.load(...)), # Shape: (2,)
|
||||
},
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
|
||||
# Single image input
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
@@ -954,9 +940,55 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
],
|
||||
model=model,
|
||||
)
|
||||
|
||||
# Multi image input
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
},
|
||||
embeds,
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
)
|
||||
|
||||
# Multi image input (interleaved)
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
embeds,
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
},
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
)
|
||||
```
|
||||
|
||||
For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
|
||||
### Cached Inputs
|
||||
|
||||
Just like with offline inference, you can skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
|
||||
|
||||
??? code
|
||||
|
||||
@@ -990,13 +1022,3 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
|
||||
},
|
||||
|
||||
```
|
||||
|
||||
!!! note
|
||||
Multiple messages can now contain `{"type": "image_embeds"}`, enabling you to pass multiple image embeddings in a single request (similar to regular images). The number of embeddings is limited by `--limit-mm-per-prompt`.
|
||||
|
||||
**Important**: The embedding shape format differs based on the number of embeddings:
|
||||
|
||||
- **Single embedding**: 3D tensor of shape `(1, feature_size, hidden_size)`
|
||||
- **Multiple embeddings**: List of 2D tensors, each of shape `(feature_size, hidden_size)`
|
||||
|
||||
If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
|
||||
|
||||
@@ -50,7 +50,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
|
||||
vllm serve Qwen/Qwen3-0.6B \
|
||||
--port 8100 \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load_failure_policy":"fail"}'
|
||||
```
|
||||
|
||||
### Consumer (Decoder) Configuration
|
||||
@@ -65,7 +65,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \
|
||||
vllm serve Qwen/Qwen3-0.6B \
|
||||
--port 8200 \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load_failure_policy":"fail"}'
|
||||
```
|
||||
|
||||
### Proxy Server
|
||||
@@ -110,7 +110,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
|
||||
UCX_NET_DEVICES=all \
|
||||
vllm serve Qwen/Qwen3-0.6B --port 8000 \
|
||||
--tensor-parallel-size 8 \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer","kv_load_failure_policy":"fail"}'
|
||||
|
||||
# Prefiller 2 on Machine B (example IP: ${IP2})
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP2} \
|
||||
@@ -118,7 +118,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
|
||||
UCX_NET_DEVICES=all \
|
||||
vllm serve Qwen/Qwen3-0.6B --port 8000 \
|
||||
--tensor-parallel-size 8 \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer","kv_load_failure_policy":"fail"}'
|
||||
```
|
||||
|
||||
### Multiple Decoder Instances on Different Machines
|
||||
@@ -130,7 +130,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
|
||||
UCX_NET_DEVICES=all \
|
||||
vllm serve Qwen/Qwen3-0.6B --port 8000 \
|
||||
--tensor-parallel-size 8 \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer","kv_load_failure_policy":"fail"}'
|
||||
|
||||
# Decoder 2 on Machine D (example IP: ${IP4})
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP4} \
|
||||
@@ -138,7 +138,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
|
||||
UCX_NET_DEVICES=all \
|
||||
vllm serve Qwen/Qwen3-0.6B --port 8000 \
|
||||
--tensor-parallel-size 8 \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer","kv_load_failure_policy":"fail"}'
|
||||
```
|
||||
|
||||
### Proxy for Multiple Instances
|
||||
@@ -164,6 +164,16 @@ For multi-host DP deployment, only need to provide the host/port of the head ins
|
||||
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
|
||||
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
|
||||
|
||||
### KV Load Failure Policy
|
||||
|
||||
The `kv_load_failure_policy` setting controls how the system handles failures when the decoder instance loads KV cache blocks from the prefiller instance:
|
||||
|
||||
- **fail** (recommended): Immediately fail the request with an error when KV load fails. This prevents performance degradation by avoiding recomputation of prefill work on the decode instance.
|
||||
- **recompute** (default): Recompute failed blocks locally on the decode instance. This may cause performance _jitter_ on decode instances as the scheduled prefill will delay and interfere with other decodes. Furthermore, decode instances are typically configured with low-latency optimizations.
|
||||
|
||||
!!! warning
|
||||
Using `kv_load_failure_policy="recompute"` can lead to performance degradation in production deployments. When KV loads fail, the decode instance will execute prefill work with decode-optimized configurations, which is inefficient and defeats the purpose of disaggregated prefilling. This also increases tail latency for other ongoing decode requests.
|
||||
|
||||
## Experimental Feature
|
||||
|
||||
### Heterogeneous KV Layout support
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.