[Frontend][last/5] Make pooling entrypoints request schema consensus. (#31127 )

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
[Tiny] Rename encoder budget file to more specific name (#34103 )
2026-02-09 06:42:38 +00:00 · 2026-02-09 03:48:19 +00:00 · 2026-02-09 09:46:46 +08:00 · 2026-02-08 13:51:09 -08:00 · 2026-02-08 11:16:48 -08:00 · 2026-02-08 18:55:47 +00:00
920 changed files with 48516 additions and 18292 deletions
--- a/.buildkite/hardware_tests/arm.yaml
+++ b/.buildkite/hardware_tests/arm.yaml
@@ -1,8 +0,0 @@
-group: Hardware
-steps:
-  - label: "Arm CPU Test"
-    soft_fail: true
-    device: arm_cpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -0,0 +1,100 @@
+group: CPU
+depends_on: []
+steps:
+- label: CPU-Kernel Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - cmake/cpu_extension.cmake
+  - CMakeLists.txt
+  - vllm/_custom_ops.py
+  - tests/kernels/attention/test_cpu_attn.py
+  - tests/kernels/moe/test_cpu_fused_moe.py
+  - tests/kernels/test_onednn.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
+      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
+      pytest -x -v -s tests/kernels/test_onednn.py"
+
+- label: CPU-Language Generation and Pooling Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/
+  - tests/models/language/generation/
+  - tests/models/language/pooling/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
+      pytest -x -v -s tests/models/language/generation -m cpu_model
+      pytest -x -v -s tests/models/language/pooling -m cpu_model"
+
+- label: CPU-Quantization Model Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/
+  - vllm/model_executor/layers/quantization/cpu_wna16.py
+  - vllm/model_executor/layers/quantization/gptq_marlin.py
+  - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+  - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+  - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
+  - tests/quantization/test_compressed_tensors.py
+  - tests/quantization/test_cpu_wna16.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
+      pytest -x -v -s tests/quantization/test_cpu_wna16.py"
+
+- label: CPU-Distributed Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - csrc/cpu/shm.cpp
+  - vllm/v1/worker/cpu_worker.py
+  - vllm/v1/worker/gpu_worker.py
+  - vllm/v1/worker/cpu_model_runner.py
+  - vllm/v1/worker/gpu_model_runner.py
+  - vllm/platforms/cpu.py
+  - vllm/distributed/parallel_state.py
+  - vllm/distributed/device_communicators/cpu_communicator.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
+
+- label: CPU-Multi-Modal Model Tests %N
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  # - vllm/
+  - vllm/model_executor/layers/rotary_embedding
+  - tests/models/multimodal/generation/
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
+      pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
+  parallelism: 2
+
+- label: "Arm CPU Test"
+  depends_on: []
+  soft_fail: true
+  device: arm_cpu
+  no_plugin: true
+  commands:
+  - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
@@ -1,13 +1,6 @@
 group: Hardware
 depends_on: ~
 steps:
-  - label: "Intel CPU Test"
-    soft_fail: true
-    device: intel_cpu
-    no_plugin: true
-    commands: 
-    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
-  
  - label: "Intel HPU Test"
    soft_fail: true
    device: intel_hpu
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -3,6 +3,7 @@ steps:
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
+    timeout_in_minutes: 600
    commands:
    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
@@ -41,7 +42,7 @@ steps:
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
-  
+
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -0,0 +1,15 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.695
+  - name: "exact_match,flexible-extract"
+    value: 0.447
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
@@ -0,0 +1,19 @@
+model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.7142
+  - name: "exact_match,flexible-extract"
+    value: 0.4579
+env_vars:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+limit: 1319
+num_fewshot: 5
+max_model_len: 262144
+kv_cache_dtype: fp8
+enforce_eager: false
+apply_chat_template: true
+fewshot_as_multiturn: true
+trust_remote_code: true
--- a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt
@@ -1 +1,2 @@
 Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -393,6 +393,11 @@ run_serving_tests() {
      fi
    fi

+    # save the compilation mode and optimization level on the serving results
+    # whenever they are set
+    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
+    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')
+
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
@@ -406,15 +411,15 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
-        # pass the tensor parallel size to the client so that it can be displayed
-        # on the benchmark dashboard
+        # pass the tensor parallel size, the compilation mode, and the optimization
+        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
-          --metadata "tensor_parallel_size=$tp" \
+          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -27,7 +27,7 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
 To download and upload the image:

 \`\`\`
-Download images:
+# Download images:

 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
@@ -35,8 +35,12 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}

-Tag and push images:
+# Tag and push images:
+
+## CUDA

 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -62,19 +66,36 @@ docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-a
 docker push vllm/vllm-openai:latest-aarch64-cu130
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+## ROCm
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}

-Create multi-arch manifest:
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
 docker push vllm/vllm-openai-rocm:latest-base
 docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base

+## CPU
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
+docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai-cpu:latest-x86_64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
+docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker push vllm/vllm-openai-cpu:latest-arm64
+docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+
+# Create multi-arch manifest:
+
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
@@ -86,5 +107,11 @@ docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
 docker manifest push vllm/vllm-openai:latest-cu130
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
+
+docker manifest rm vllm/vllm-openai-cpu:latest || true
+docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
+docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
+docker manifest push vllm/vllm-openai-cpu:latest
+docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
 \`\`\`
-EOF 
+EOF
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -87,7 +87,7 @@ mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

 commands=$@
-echo "Commands:$commands"
+echo "Raw commands: $commands"

 commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

@@ -169,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+commands=$(echo "$commands" | sed 's/ \\ / /g')
+echo "Final commands: $commands"
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -176,7 +179,6 @@ fi
 # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


-PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

 # Test that we're launching on the machine that has
@@ -187,56 +189,7 @@ if [[ -z "$render_gid" ]]; then
  exit 1
 fi

-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
-if [[ $commands == *"--shard-id="* ]]; then
-  # assign job count as the number of shards used
-  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
-    # assign shard-id for each shard
-    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
-    echo "Shard ${GPU} commands:$commands_gpu"
-    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
-    docker run \
-        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-        --network=host \
-        --shm-size=16gb \
-        --group-add "$render_gid" \
-        --rm \
-        -e HIP_VISIBLE_DEVICES="${GPU}" \
-        -e HF_TOKEN \
-        -e AWS_ACCESS_KEY_ID \
-        -e AWS_SECRET_ACCESS_KEY \
-        -v "${HF_CACHE}:${HF_MOUNT}" \
-        -e "HF_HOME=${HF_MOUNT}" \
-        -e "PYTHONPATH=${MYPYTHONPATH}" \
-        --name "${container_name}_${GPU}" \
-        "${image_name}" \
-        /bin/bash -c "${commands_gpu}" \
-        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
-    PIDS+=($!)
-  done
-  #wait for all processes to finish and collect exit codes
-  for pid in "${PIDS[@]}"; do
-    wait "${pid}"
-    STATUS+=($?)
-  done
-  at_least_one_shard_with_tests=0
-  for st in "${STATUS[@]}"; do
-    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
-      echo "One of the processes failed with $st"
-      exit "${st}"
-    elif [[ ${st} -eq 5 ]]; then
-      echo "Shard exited with status 5 (no tests collected) - treating as success"
-    else # This means st is 0
-      at_least_one_shard_with_tests=1
-    fi
-  done
-  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
-    echo "All shards reported no tests collected. Failing the build."
-    exit 1
-  fi
-
-elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then

  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')

--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Smoke test for distributed CPU serving: for each parallelism layout
+# (PP+TP, then DP+TP) start `vllm serve` in the background, wait for the
+# models endpoint to answer, run a short `vllm bench serve`, and stop the server.
+set -euox pipefail
+
+echo "--- PP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+server_pid=$!
+# Poll the models endpoint once per second; abort if it is not up within 600s.
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+    --backend vllm \
+    --dataset-name random \
+    --model meta-llama/Llama-3.2-3B-Instruct \
+    --num-prompts 20 \
+    --endpoint /v1/completions
+# Stop the server and wait for it to exit before continuing. Both servers use
+# localhost:8000, so a backgrounded kill could let the next readiness poll be
+# answered by this still-running server.
+kill -s SIGTERM "$server_pid" || true
+wait "$server_pid" || true
+
+echo "--- DP+TP"
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+    --backend vllm \
+    --dataset-name random \
+    --model meta-llama/Llama-3.2-3B-Instruct \
+    --num-prompts 20 \
+    --endpoint /v1/completions
+# Stop the server and wait for it so no server process outlives the script.
+kill -s SIGTERM "$server_pid" || true
+wait "$server_pid" || true
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -2,119 +2,19 @@

 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -ex
+set -euox pipefail

 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
-# used for TP/PP E2E test
-OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
+IMAGE_NAME="cpu-test-$NUMA_NODE"
+TIMEOUT_VAL=$1
+TEST_COMMAND=$2

-export CMAKE_BUILD_PARALLEL_LEVEL=32
-
-# Setup cleanup
-remove_docker_container() {
-    set -e;
-    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+# building the docker image
+echo "--- :docker: Building Docker image"
+docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
-
-function cpu_tests() {
-  set -e
-  export NUMA_NODE=$2
-
-  # list packages
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    pip list"
-
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pip list"
-
-  # offline inference
-  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
-    pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
-    pytest -x -v -s tests/kernels/test_onednn.py"
-
-  # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
-    pytest -x -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
-
-    pytest -x -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_pixtral.py \
-                -m cpu_model"
-
-  # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
-
-  # Run AWQ/GPTQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/quantization/test_cpu_wna16.py"
-
-  # Run multi-lora tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -x -s -v \
-    tests/lora/test_qwenvl.py"
-
-  # online serving: tp+pp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-
-  # online serving: tp+dp
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
-    set -e
-    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
-    server_pid=$!
-    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-    vllm bench serve \
-      --backend vllm \
-      --dataset-name random \
-      --model meta-llama/Llama-3.2-3B-Instruct \
-      --num-prompts 20 \
-      --endpoint /v1/completions
-    kill -s SIGTERM $server_pid &'
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-export -f cpu_tests
-timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
+        timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -5,7 +5,9 @@
 set -exuo pipefail

 # Try building the docker image
-cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
+image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
+container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
+cat <<EOF | docker build -t ${image_name} -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm
@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

-RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
+RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
 RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

 # install development dependencies (for testing)
@@ -36,15 +39,20 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
+remove_docker_containers() { docker rm -f ${container_name} || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
+docker run --rm --runtime=habana --name=${container_name} --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
-  hpu-plugin-v1-test-env \
-  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
+  -e VLLM_SKIP_WARMUP=true \
+  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+  -e PT_HPU_LAZY_MODE=1 \
+  "${image_name}" \
+  /bin/bash -c '
+  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+'

 EXITCODE=$?
 if [ $EXITCODE -eq 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -38,10 +38,12 @@ docker run \
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,7 +43,6 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
+    --all2all-backend $BACK \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -70,6 +70,7 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -82,6 +83,7 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -231,6 +233,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -266,10 +269,16 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+  # NEW rlhf examples
+  - pushd ../examples/offline_inference/new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
+  - popd

 - label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
@@ -505,7 +514,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -525,6 +534,7 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
+    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Samplers Test # 56min
  timeout_in_minutes: 75
@@ -604,9 +614,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -742,7 +754,7 @@ steps:
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@@ -753,7 +765,7 @@ steps:
 - label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_8
+  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
@@ -852,10 +864,11 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
@@ -1178,44 +1191,26 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/passes/test_fusion_attn.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+
+    # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # # Wrap with quotes to escape yaml
+    # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1562,12 +1557,15 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
+    # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
    - pytest -v -s tests/v1/distributed/test_dbo.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -63,6 +63,7 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -75,6 +76,7 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -204,6 +206,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -238,10 +241,16 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+  # NEW rlhf examples
+  - pushd ../examples/offline_inference/new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
+  - popd

 - label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
@@ -444,7 +453,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -510,6 +519,7 @@ steps:
  # However, find does not normally propagate error codes, so we combine it with xargs
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
+  - pytest -s -v compile/passes --ignore compile/passes/distributed

 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -537,9 +547,11 @@ steps:
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -793,10 +805,11 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
@@ -1068,84 +1081,23 @@ steps:
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    #  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    #  # Wrap with quotes to escape yaml
+    #  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
- label: Hopper Fusion E2E Tests (H100) # 10min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)  # 70min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1203,6 +1155,8 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py
+  - pytest -v -s distributed/test_packed_tensor.py
+  - pytest -v -s distributed/test_weight_transfer.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
@@ -1468,8 +1422,8 @@ steps:
  commands:
    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
    # Run sequence parallel tests
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+    - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py

 - label: Distributed Tests (H100) # optional
  gpu: h100
@@ -1477,7 +1431,7 @@ steps:
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,56 +2,202 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Fusion and Compile Tests (B200)
-  timeout_in_minutes: 40
+- label: Sequence Parallel Correctness Tests (2 GPUs)
+  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
-  device: b200
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  - tests/compile/test_silu_mul_quant_fusion.py
-  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  - tests/compile/fullgraph/test_full_graph.py
-  commands:
-    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
-    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_devices=2 is not set
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
-
- label: Fusion E2E (2 GPUs)(B200)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  device: b200
-  optional: true
  num_devices: 2
  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/model_executor/layers/
  - vllm/compilation/
-  # can affect pattern matching
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - tests/compile/correctness_e2e/test_sequence_parallel.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+
+- label: Sequence Parallel Correctness Tests (2xH100)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+
+- label: AsyncTP Correctness Tests (2xH100)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
+- label: Distributed Compile Unit Tests (2xH100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/model_executor/layers
+  - tests/compile/passes/distributed/
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -s -v tests/compile/passes/distributed
+
+- label: Fusion and Compile Unit Tests (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
+  - tests/compile/passes/test_fusion_attn.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
+    - nvidia-smi
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_devices=2 is not set
+    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    # TODO(luka) move to H100 once pass tests run on H100
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Fusion E2E Quick (H100)
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

+- label: Fusion E2E Config Sweep (H100)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
+
+- label: Fusion E2E Config Sweep (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - nvidia-smi
+    # pytest honors only the last -k it is given, so the three selections are OR-ed into one expression:
+    # 1) all models and attn backends but only Inductor partition and native custom ops:
+    #      (inductor_partition and not +rms_norm and not +quant_fp8)
+    # 2) Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported:
+    #      (inductor_partition and not +rms_norm and +quant_fp8 and qwen3)
+    # 3) just llama3 (fp8 & fp4) for all config combinations: llama-3
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"
+
+- label: Fusion E2E TP2 Quick (H100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+
+- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
+
+- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+
+- label: Fusion E2E TP2 (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops;
+    # for ar-rms-quant-fp4, also sweep llama3 (OR-ed into a single -k: pytest honors only the last -k given)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
--- a/.buildkite/test_areas/cuda.yaml
+++ b/.buildkite/test_areas/cuda.yaml
@@ -9,6 +9,7 @@ steps:
  - tests/cuda
  commands:
    - pytest -v -s cuda/test_cuda_context.py
+    - pytest -v -s cuda/test_platform_no_cuda_init.py

 - label: Cudagraph
  timeout_in_minutes: 20
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 90
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:
@@ -47,7 +47,6 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -63,6 +62,7 @@ steps:
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
+  - examples/offline_inference/new_weight_syncing/
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
@@ -97,9 +97,14 @@ steps:
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
+  # OLD rlhf examples
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  # NEW rlhf examples
+  - cd new_weight_syncing
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py

 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
@@ -133,25 +138,13 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Sequence Parallel Tests (H100)
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run sequence parallel tests
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
 - label: Distributed Tests (2 GPUs)(H100)
+  timeout_in_minutes: 15
  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_devices: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -217,45 +210,3 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-
- label: Hopper Fusion E2E Tests (H100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -16,7 +16,7 @@ steps:
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not slow_test' v1/spec_decode
    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
    - pytest -v -s -m 'not cpu_test' v1/metrics
    - pytest -v -s v1/test_oracle.py
@@ -72,7 +72,7 @@ steps:
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     # for pooling models
-    - python3 pooling/pooling/vision_language_pooling.py --seed 0
+    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # for features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
@@ -122,6 +122,7 @@ steps:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
+  - tests/test_pooling_params.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
@@ -134,6 +135,7 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
+  - pytest -v -s test_pooling_params.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
@@ -166,4 +168,18 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pip install pytest-timeout pytest-forked
    - pytest -v -s v1/determinism/test_batch_invariance.py
-    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+  
+- label: Acceptance Length Test (Large Models) # optional
+  timeout_in_minutes: 25
+  device: h100  # hardware is selected with device/num_devices, matching the other test_areas steps
+  optional: true
+  num_devices: 1
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/v1/spec_decode/
+  - vllm/model_executor/models/mlp_speculator.py
+  - tests/v1/spec_decode/test_acceptance_length.py
+  commands:
+    - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
+    - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test  # only the slow_test-marked cases
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -33,10 +33,11 @@ steps:
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
+  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py

 - label: Basic Models Test (Other CPU) # 5min
  depends_on: 
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -3,7 +3,7 @@ depends_on:
  - image-build
 steps:
 - label: PyTorch Compilation Unit Tests
-  timeout_in_minutes: 30
+  timeout_in_minutes: 10
  source_file_dependencies:
    - vllm/
    - tests/compile
@@ -17,8 +17,16 @@ steps:
  # (using -0 for proper path handling)
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

+- label: PyTorch Compilation Passes Unit Tests
+  timeout_in_minutes: 20
+  source_file_dependencies:
+    - vllm/
+    - tests/compile/passes
+  commands:
+  - pytest -s -v compile/passes --ignore compile/passes/distributed
+
 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -30,16 +38,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 40
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -121,24 +121,9 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
-  - id: enforce-import-regex-instead-of-re
-    name: Enforce import regex as re
-    entry: python tools/pre_commit/enforce_regex_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
-  # forbid directly import triton
-  - id: forbid-direct-triton-import
-    name: "Forbid direct 'import triton'"
-    entry: python tools/pre_commit/check_triton_import.py
-    language: python
-    types: [python]
-    pass_filenames: false
-    additional_dependencies: [regex]
-  - id: check-pickle-imports
-    name: Prevent new pickle/cloudpickle imports
-    entry: python tools/pre_commit/check_pickle_imports.py
+  - id: check-forbidden-imports
+    name: Check for forbidden imports
+    entry: python tools/pre_commit/check_forbidden_imports.py
    language: python
    types: [python]
    additional_dependencies: [regex]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -433,7 +433,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
    endif()

-    if (MARLIN_SM75_ARCHS) 
+    if (MARLIN_SM75_ARCHS)
      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/marlin/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -445,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
    endif()

-    if (MARLIN_FP8_ARCHS) 
+    if (MARLIN_FP8_ARCHS)
      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
    endif()

-    if (MARLIN_MOE_SM75_ARCHS) 
+    if (MARLIN_MOE_SM75_ARCHS)
      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_MOE_SM75_SRC}"
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor

 ## Usage

-For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).

 For full CLI reference see:

--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -842,6 +842,7 @@ class BenchmarkTensors:
            "sorted_token_ids": sorted_token_ids,
            "expert_ids": expert_ids,
            "num_tokens_post_padded": num_tokens_post_padded,
+            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "top_k_num": ctx.top_k_num,
            "device": self.input.device,
            "N": lora_rank,
@@ -915,6 +916,7 @@ class BenchmarkTensors:
            "sorted_token_ids": sorted_token_ids,
            "expert_ids": expert_ids,
            "num_tokens_post_padded": num_tokens_post_padded,
+            "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
            "top_k_num": ctx.top_k_num,
            "device": self.input.device,
            "N": lora_rank,
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
 )
-from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -482,6 +481,8 @@ class BenchmarkWorker:
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
+        # NOTE(review): leftover comment - no local import follows in this method.
+
        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@@ -535,6 +536,9 @@ class BenchmarkWorker:
        block_quant_shape: list[int],
        use_deep_gemm: bool,
    ) -> dict[str, int]:
+        # local import to allow serialization by ray
+        from vllm.platforms import current_platform
+
        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():
@@ -646,20 +650,28 @@ def save_configs(
        f.write("\n")


+def get_compressed_tensors_block_structure(config, default_value=None):
+    config_groups = config.get("config_groups", {})
+    if len(config_groups) != 1:
+        return default_value
+    group = next(iter(config_groups.values()))
+    weights = group.get("weights", {})
+    block_structure = weights.get("block_structure", default_value)
+    return block_structure
+
+
 def get_weight_block_size_safety(config, default_value=None):
    quantization_config = getattr(config, "quantization_config", {})
    if isinstance(quantization_config, dict):
-        return quantization_config.get("weight_block_size", default_value)
+        if "weight_block_size" in quantization_config:
+            return quantization_config["weight_block_size"]
+        return get_compressed_tensors_block_structure(
+            quantization_config, default_value
+        )
    return default_value


-def main(args: argparse.Namespace):
-    print(args)
-
-    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
-    if args.model_prefix:
-        config = getattr(config, args.model_prefix)
-
+def get_model_params(config):
    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
@@ -677,6 +689,7 @@ def main(args: argparse.Namespace):
        "Glm4MoeForCausalLM",
        "Glm4MoeLiteForCausalLM",
        "NemotronHForCausalLM",
+        "MistralLarge3ForCausalLM",
    ):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
@@ -697,16 +710,20 @@ def main(args: argparse.Namespace):
        topk = text_config.num_experts_per_tok
        intermediate_size = text_config.moe_intermediate_size
        hidden_size = text_config.hidden_size
-    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
+    elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
        E = config.num_experts
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
        hidden_size = config.hidden_size
-    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+    elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
        E = config.thinker_config.text_config.num_experts
        topk = config.thinker_config.text_config.num_experts_per_tok
        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
        hidden_size = config.thinker_config.text_config.hidden_size
+    elif config.architectures[0] == "PixtralForConditionalGeneration":
+        # Pixtral can contain different LLM architectures,
+        # recurse to get their parameters
+        return get_model_params(config.get_text_config())
    else:
        # Support for llama4
        config = config.get_text_config()
@@ -715,6 +732,16 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        hidden_size = config.hidden_size
+    return E, topk, intermediate_size, hidden_size
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
+    if args.model_prefix:
+        config = getattr(config, args.model_prefix)
+    E, topk, intermediate_size, hidden_size = get_model_params(config)
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -44,10 +44,8 @@ def benchmark_permute(
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
    # output_hidden_states = torch.empty_like(hidden_states)
    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
-        align_block_size = None
        qhidden_states = hidden_states

    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
@@ -67,7 +65,6 @@ def benchmark_permute(
            topk_ids=topk_ids,
            n_expert=num_experts,
            expert_map=None,
-            align_block_size=align_block_size,
        )

    # JIT compilation & warmup
@@ -117,10 +114,8 @@ def benchmark_unpermute(
    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
    else:
-        align_block_size = None
        qhidden_states = hidden_states

    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
@@ -142,7 +137,6 @@ def benchmark_unpermute(
            topk_ids=topk_ids,
            n_expert=num_experts,
            expert_map=None,
-            align_block_size=align_block_size,
        )
        # convert to fp16/bf16 as gemm output
        return (
--- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
 )
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa


@@ -58,7 +58,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
--- a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
@@ -7,8 +7,8 @@ import time
 import numpy as np
 import torch

-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed

 # Check if CPU MoE operations are available
 try:
@@ -41,7 +41,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    # up_dim = 2 * intermediate_size for gate + up projection
    up_dim = 2 * intermediate_size

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -359,6 +359,19 @@ else()
    add_compile_definitions(-DVLLM_NUMA_DISABLED)
 endif()

+#
+# Generate CPU attention dispatch header
+#
+message(STATUS "Generating CPU attention dispatch header")
+execute_process(
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
+    RESULT_VARIABLE GEN_RESULT
+)
+if(NOT GEN_RESULT EQUAL 0)
+    message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
+endif()
+
 #
 # _C extension
 #
--- a/cmake/external_projects/triton_kernels.cmake
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -1,9 +1,9 @@
 # Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels

-set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
+set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")

 # Set TRITON_KERNELS_SRC_DIR for use with local development with vLLM. We expect TRITON_KERNELS_SRC_DIR to
-# be directly set to the triton_kernels python directory. 
+# be directly set to the triton_kernels python directory.
 if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
  message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
  FetchContent_Declare(
@@ -24,7 +24,7 @@ else()
  )
 endif()

-# Fetch content 
+# Fetch content
 FetchContent_MakeAvailable(triton_kernels)

 if (NOT triton_kernels_SOURCE_DIR)
@@ -47,7 +47,7 @@ install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/tr
 ## Copy .py files to install directory.
 install(DIRECTORY
        ${TRITON_KERNELS_PYTHON_DIR}
-        DESTINATION 
+        DESTINATION
        vllm/third_party/triton_kernels/
        COMPONENT triton_kernels
        FILES_MATCHING PATTERN "*.py")
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -1,79 +1,4 @@
-#include "cpu_attn_vec.hpp"
-#include "cpu_attn_vec16.hpp"
-
-#ifdef CPU_CAPABILITY_AMXBF16
-  #include "cpu_attn_amx.hpp"
-  #define AMX_DISPATCH(...)                                                   \
-    case cpu_attention::ISA::AMX: {                                           \
-      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX, \
-                                                     scalar_t, head_dim>;     \
-      return __VA_ARGS__();                                                   \
-    }
-#else
-  #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
-#endif
-
-#ifdef __aarch64__
-  #include "cpu_attn_neon.hpp"
-  // NEON requires head_dim to be a multiple of 32
-  #define NEON_DISPATCH(...)                                                   \
-    case cpu_attention::ISA::NEON: {                                           \
-      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
-                                                     scalar_t, head_dim>;      \
-      return __VA_ARGS__();                                                    \
-    }
-#else
-  #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
-#endif  // #ifdef __aarch64__
-
-#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
-  case HEAD_DIM: {                            \
-    constexpr size_t head_dim = HEAD_DIM;     \
-    return __VA_ARGS__();                     \
-  }
-
-#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)           \
-  [&] {                                                         \
-    switch (HEAD_DIM) {                                         \
-      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
-      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                  \
-      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                  \
-      default: {                                                \
-        TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \
-                               std::to_string(HEAD_DIM));       \
-      }                                                         \
-    }                                                           \
-  }()
-
-#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                 \
-  [&] {                                                                       \
-    switch (ISA_TYPE) {                                                       \
-      AMX_DISPATCH(__VA_ARGS__)                                               \
-      NEON_DISPATCH(__VA_ARGS__)                                              \
-      case cpu_attention::ISA::VEC: {                                         \
-        using attn_impl =                                                     \
-            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,   \
-                                         head_dim>;                           \
-        return __VA_ARGS__();                                                 \
-      }                                                                       \
-      case cpu_attention::ISA::VEC16: {                                       \
-        using attn_impl =                                                     \
-            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t, \
-                                         head_dim>;                           \
-        return __VA_ARGS__();                                                 \
-      }                                                                       \
-      default: {                                                              \
-        TORCH_CHECK(false, "Invalid CPU attention ISA type.");                \
-      }                                                                       \
-    }                                                                         \
-  }()
+#include "cpu_attn_dispatch_generated.h"

 torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,
@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
  input.enable_kv_split = enable_kv_split;

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
-    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
-      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
-        input.elem_size = sizeof(scalar_t);
-        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
-        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
-        input.output_buffer_elem_size =
-            sizeof(attn_impl::partial_output_buffer_t);
-        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
-        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
-      });
+    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
+      input.elem_size = sizeof(scalar_t);
+      input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
+      input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
+      input.output_buffer_elem_size =
+          sizeof(attn_impl::partial_output_buffer_t);
+      input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
+      input.kv_block_alignment = attn_impl::BlockSizeAlignment;
    });
  });

@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
-        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
-          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
-            attn_impl::reshape_and_cache(
-                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
-                key_cache.data_ptr<scalar_t>(),
-                value_cache.data_ptr<scalar_t>(),
-                slot_mapping.data_ptr<int64_t>(), token_num,
-                key_token_num_stride, value_token_num_stride, head_num,
-                key_head_num_stride, value_head_num_stride, num_blocks,
-                num_blocks_stride, cache_head_num_stride, block_size,
-                block_size_stride);
-          });
+        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
+          attn_impl::reshape_and_cache(
+              key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
+              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
+              value_token_num_stride, head_num, key_head_num_stride,
+              value_head_num_stride, num_blocks, num_blocks_stride,
+              cache_head_num_stride, block_size, block_size_stride);
        });
      });
 }
@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
-        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
-          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
-            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
-            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
-            mainloop(&input);
-          });
+        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
+          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
+          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
+          mainloop(&input);
        });
      });
 }
--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, const float scale) {
    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
-    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
    constexpr int64_t head_elem_num_pre_block =
        AMX_TILE_ROW_BYTES / sizeof(scalar_t);
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
 };

-// ARM only supports BF16 with ARMv8.6-A extension
-#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
-#else
 template <>
 struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
 };
-#endif

 #if !defined(__powerpc__) && !defined(__s390x__)
 template <>
@@ -1111,7 +1107,8 @@ class AttentionMainLoop {
          if (sliding_window_left != -1) {
            pos = std::max(pos, curr_token_pos - sliding_window_left);
          }
-          return pos;
+          // Clamp to tile end to avoid OOB when window starts past the tile
+          return std::min(pos, kv_tile_end_pos);
        }();

        int32_t right_kv_pos = [&]() {
@@ -1585,17 +1582,10 @@ class AttentionMainLoop {

              if (use_sink) {
                alignas(64) float s_aux_fp32[16];
-#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
-                // ARM without native BF16 support: manual conversion
-                for (int i = 0; i < 16; ++i) {
-                  s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
-                }
-#else
                // All other platforms have BF16Vec16 available
                vec_op::BF16Vec16 vec_bf16(curr_s_aux);
                vec_op::FP32Vec16 vec_fp32(vec_bf16);
                vec_fp32.save(s_aux_fp32);
-#endif

                float* __restrict__ curr_sum_buffer = sum_buffer;
                float* __restrict__ curr_max_buffer = max_buffer;
--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -4,6 +4,9 @@
 #include "cpu_attn_impl.hpp"
 #include <arm_neon.h>
 #include <type_traits>
+#ifdef ARM_BF16_SUPPORT
+  #include "cpu_attn_neon_bfmmla.hpp"
+#endif
 namespace cpu_attention {

 namespace {
@@ -57,7 +60,7 @@ FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
 #endif
 }

-// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
+// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with ASIMD FMLAs
 // #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
 // #FMLAs = (K // 4) * (4 * 2 * M)
 // We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
@@ -264,7 +267,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
  constexpr static ISA ISAType = ISA::NEON;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

-  //  static_assert(HeadDim % HeadDimAlignment == 0);
+  static_assert(HeadDim % HeadDimAlignment == 0);
  // the gemm micro kernel is Mx8
  static_assert(HeadDimAlignment % 8 == 0);
  static_assert(BlockSizeAlignment % 8 == 0);
@@ -381,6 +384,18 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
    }
  }
 };
+
+#ifdef ARM_BF16_SUPPORT
+// For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
+template <int64_t head_dim>
+class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
+    : public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
+                                     head_dim> {};
+#endif
 }  // namespace cpu_attention

-#endif  // #ifndef CPU_ATTN_NEON_HPP
+#undef BLOCK_SIZE_ALIGNMENT
+#undef HEAD_SIZE_ALIGNMENT
+#undef MAX_Q_HEAD_NUM_PER_ITER
+
+#endif  // #ifndef CPU_ATTN_NEON_HPP
--- a/csrc/cpu/cpu_attn_neon_bfmmla.hpp
+++ b/csrc/cpu/cpu_attn_neon_bfmmla.hpp
@@ -0,0 +1,682 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#ifndef CPU_ATTN_NEON_BFMMLA_HPP
+#define CPU_ATTN_NEON_BFMMLA_HPP
+
+#include "cpu_attn_impl.hpp"
+
+#include <arm_neon.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace cpu_attention {
+
+namespace {
+
+// BFMMLA tile dimensions
+constexpr int32_t TILE_ROWS = 2;  // M dimension
+constexpr int32_t TILE_K = 4;     // K reduction
+constexpr int32_t TILE_COLS = 2;  // N dimension (column-pair)
+
+// Derived constants
+constexpr int32_t OUTPUT_COLS_PER_BLOCK = 8;   // 4 column-pairs
+constexpr int32_t K_TOKENS_PER_GROUP = 8;      // Tokens grouped in K cache
+constexpr int32_t V_TOKENS_PER_ROW_BLOCK = 4;  // Tokens per V cache row block
+constexpr int32_t K_INNER_STRIDE = K_TOKENS_PER_GROUP * TILE_K;
+constexpr int32_t V_INNER_STRIDE = V_TOKENS_PER_ROW_BLOCK * TILE_COLS;
+constexpr int32_t PACK_ELEMENTS_PER_K_CHUNK = TILE_ROWS * TILE_K;  // A packing
+
+// Matrix Packing and Accumulator
+// Reshape two rows of Q into the BFMMLA-friendly interleaved layout
+// Input:  row0 = [a0,a1,a2,a3,a4,...], row1 = [b0,b1,b2,b3,b4,...]
+// Output: [a0,a1,a2,a3,b0,b1,b2,b3, a4,a5,a6,a7,b4,b5,b6,b7, ...]
+// For K tail (K % TILE_K != 0): pads with zeros to complete the final chunk
+FORCE_INLINE void reshape_Q_2xK_for_bfmmla(const c10::BFloat16* __restrict r0,
+                                           const c10::BFloat16* __restrict r1,
+                                           c10::BFloat16* __restrict dst,
+                                           int32_t K) {
+  const uint16_t* s0 = reinterpret_cast<const uint16_t*>(r0);
+  const uint16_t* s1 = reinterpret_cast<const uint16_t*>(r1);
+  uint16_t* d = reinterpret_cast<uint16_t*>(dst);
+
+  // Process TILE_K elements at a time (PACK_ELEMENTS_PER_K_CHUNK output)
+  int32_t k = 0;
+  for (; k + TILE_K <= K; k += TILE_K, d += PACK_ELEMENTS_PER_K_CHUNK) {
+    vst1q_u16(d, vcombine_u16(vld1_u16(s0 + k), vld1_u16(s1 + k)));
+  }
+
+  // Handle K tail: pack remaining elements with zero-padding
+  const int32_t tail = K - k;
+  if (tail > 0) {
+    // Pack remaining tail elements: [r0[k..k+tail-1], pad, r1[k..k+tail-1],
+    // pad]
+    for (int32_t t = 0; t < tail; ++t) {
+      d[t] = s0[k + t];
+      d[t + TILE_K] = s1[k + t];
+    }
+    // Zero-pad the rest
+    for (int32_t t = tail; t < TILE_K; ++t) {
+      d[t] = 0;
+      d[t + TILE_K] = 0;
+    }
+  }
+}
+
+// 2x2 accumulator load/store with compile-time row count
+template <int32_t m_rows>
+FORCE_INLINE float32x4_t load_acc_2x2(float* base, int64_t ldc, int col_off) {
+  static_assert(m_rows == 1 || m_rows == 2);
+  // Low half: the two floats of row 0 at column offset col_off.
+  const float32x2_t top = vld1_f32(base + col_off);
+  if constexpr (m_rows == 2) {
+    // High half: the same two columns from the next row (ldc floats further).
+    return vcombine_f32(top, vld1_f32(base + ldc + col_off));
+  } else {
+    // Single valid row: the second row is not read; its lanes start at zero.
+    return vcombine_f32(top, vdup_n_f32(0.f));
+  }
+}
+
+template <int32_t m_rows>
+FORCE_INLINE void store_acc_2x2(float32x4_t acc, float* base, int64_t ldc,
+                                int col_off) {
+  static_assert(m_rows == 1 || m_rows == 2);
+  vst1_f32(base + col_off, vget_low_f32(acc));
+  if constexpr (m_rows == 2) {
+    vst1_f32(base + ldc + col_off, vget_high_f32(acc));
+  }
+}
+
+// Initialize 4 column-pair accumulators for 2 rows (8 columns total)
+#define INIT_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows, accum) \
+  do {                                                               \
+    if (accum) {                                                     \
+      if (m_rows == 2) {                                             \
+        a0 = load_acc_2x2<2>(Crow, ldc, 0);                          \
+        a1 = load_acc_2x2<2>(Crow, ldc, 2);                          \
+        a2 = load_acc_2x2<2>(Crow, ldc, 4);                          \
+        a3 = load_acc_2x2<2>(Crow, ldc, 6);                          \
+      } else {                                                       \
+        a0 = load_acc_2x2<1>(Crow, ldc, 0);                          \
+        a1 = load_acc_2x2<1>(Crow, ldc, 2);                          \
+        a2 = load_acc_2x2<1>(Crow, ldc, 4);                          \
+        a3 = load_acc_2x2<1>(Crow, ldc, 6);                          \
+      }                                                              \
+    } else {                                                         \
+      a0 = a1 = a2 = a3 = vdupq_n_f32(0.f);                          \
+    }                                                                \
+  } while (0)
+
+// Store 4 column-pair accumulators back to C matrix
+#define STORE_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows) \
+  do {                                                         \
+    if (m_rows == 2) {                                         \
+      store_acc_2x2<2>(a0, Crow, ldc, 0);                      \
+      store_acc_2x2<2>(a1, Crow, ldc, 2);                      \
+      store_acc_2x2<2>(a2, Crow, ldc, 4);                      \
+      store_acc_2x2<2>(a3, Crow, ldc, 6);                      \
+    } else {                                                   \
+      store_acc_2x2<1>(a0, Crow, ldc, 0);                      \
+      store_acc_2x2<1>(a1, Crow, ldc, 2);                      \
+      store_acc_2x2<1>(a2, Crow, ldc, 4);                      \
+      store_acc_2x2<1>(a3, Crow, ldc, 6);                      \
+    }                                                          \
+  } while (0)
+
+// Perform 4 BFMMLA operations: acc += A @ B for 4 column-pairs
+#define BFMMLA_COMPUTE_4(r0, r1, r2, r3, a, b0, b1, b2, b3) \
+  do {                                                      \
+    r0 = vbfmmlaq_f32(r0, a, b0);                           \
+    r1 = vbfmmlaq_f32(r1, a, b1);                           \
+    r2 = vbfmmlaq_f32(r2, a, b2);                           \
+    r3 = vbfmmlaq_f32(r3, a, b3);                           \
+  } while (0)
+
+// Micro-kernel: updates a small fixed tile using BFMMLA.
+// RP = number of row-pairs (1,2,4)
+// Computes C[TILE_ROWS*RP, OUTPUT_COLS_PER_BLOCK] += A_packed @ B.
+// A_packed interleaves RP row-pairs; B layout is driven by the attention phase:
+// - AttentionGemmPhase::QK -> token-column layout (Q @ K^T)
+// - AttentionGemmPhase::PV -> token-row layout (P @ V)
+// K_static < 0 enables runtime K (PV only)
+//
+// Parameters:
+// - A_packed_rp: RP pointers, one per row-pair, each to A packed in chunks of
+//   PACK_ELEMENTS_PER_K_CHUNK (TILE_K values of row 0, then TILE_K of row 1).
+// - m_rows_rp:   valid rows (1 or 2) of each row-pair; with 1, the second row
+//   of C is neither read nor written.
+// - B_blk:       start of the B data for these OUTPUT_COLS_PER_BLOCK columns.
+// - C, ldc:      output tile and its row stride in floats.
+// - accumulate:  true -> accumulators start from C; false -> start from zero.
+// - b_stride:    PV: distance between consecutive column-pairs of B;
+//                QK: distance between consecutive TILE_K reduction steps.
+// - K_runtime:   reduction length, only read when K_static < 0.
+template <int32_t RP, int32_t K_static, AttentionGemmPhase phase>
+FORCE_INLINE void gemm_rowpairs_x8_bfmmla_neon(
+    const bfloat16_t* const* __restrict A_packed_rp,
+    const int32_t* __restrict m_rows_rp, const bfloat16_t* __restrict B_blk,
+    float* __restrict C, int64_t ldc, bool accumulate, int64_t b_stride,
+    int32_t K_runtime = 0) {
+  static_assert(RP == 1 || RP == 2 || RP == 4, "RP must be 1,2,4");
+  static_assert(K_static < 0 || K_static % TILE_K == 0,
+                "K must be divisible by TILE_K");
+  static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
+                "Runtime K only supported for PV");
+
+  constexpr bool runtime_k = (K_static < 0);
+  // Full TILE_K steps handled by BFMMLA; the remainder (runtime K only) goes
+  // through the scalar-widening tail below.
+  const int32_t K_iters =
+      runtime_k ? (K_runtime / TILE_K) : (K_static / TILE_K);
+  const int32_t K_tail = runtime_k ? (K_runtime % TILE_K) : 0;
+
+  if (!runtime_k) {
+    // Help the compiler fold away unused K_runtime when K is compile-time
+    (void)K_runtime;
+  }
+
+  auto* C_al = C;
+  const auto* B_al = B_blk;
+
+  // Setup A pointers
+  const bfloat16_t* a_ptr[4] = {
+      A_packed_rp[0],
+      (RP >= 2) ? A_packed_rp[1] : nullptr,
+      (RP >= 4) ? A_packed_rp[2] : nullptr,
+      (RP >= 4) ? A_packed_rp[3] : nullptr,
+  };
+
+  // Setup B pointers based on layout. Only the PV phase uses b_ptr: one
+  // pointer per output column-pair. QK derives its addresses from B_al inside
+  // the main loop instead.
+  const bfloat16_t* b_ptr[4];
+  if constexpr (phase == AttentionGemmPhase::PV) {
+    b_ptr[0] = B_blk + 0 * b_stride;
+    b_ptr[1] = B_blk + 1 * b_stride;
+    b_ptr[2] = B_blk + 2 * b_stride;
+    b_ptr[3] = B_blk + 3 * b_stride;
+  }
+
+  // acc[rp][p]: 2x2 float tile for row-pair rp and column-pair p, laid out as
+  // [r0c0, r0c1, r1c0, r1c1].
+  float32x4_t acc[4][4];
+
+// Initialize accumulators
+#define INIT_RP(rp)                                                            \
+  if constexpr (RP > rp) {                                                     \
+    INIT_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3],         \
+                       C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp], accumulate); \
+  }
+  INIT_RP(0);
+  INIT_RP(1);
+  INIT_RP(2);
+  INIT_RP(3);
+#undef INIT_RP
+
+  // Main compute loop
+  for (int32_t ki = 0; ki < K_iters; ++ki) {
+    bfloat16x8_t b0, b1, b2, b3;
+    if constexpr (phase == AttentionGemmPhase::PV) {
+      b0 = vld1q_bf16(b_ptr[0] + ki * V_INNER_STRIDE);
+      b1 = vld1q_bf16(b_ptr[1] + ki * V_INNER_STRIDE);
+      b2 = vld1q_bf16(b_ptr[2] + ki * V_INNER_STRIDE);
+      b3 = vld1q_bf16(b_ptr[3] + ki * V_INNER_STRIDE);
+    } else {
+      // QK: the four column-pairs of one reduction step sit back to back,
+      // 8 bf16 values each (V_INNER_STRIDE is reused here as that 8-element
+      // operand size).
+      const bfloat16_t* b_base = B_al + ki * b_stride;
+      b0 = vld1q_bf16(b_base + 0 * V_INNER_STRIDE);
+      b1 = vld1q_bf16(b_base + 1 * V_INNER_STRIDE);
+      b2 = vld1q_bf16(b_base + 2 * V_INNER_STRIDE);
+      b3 = vld1q_bf16(b_base + 3 * V_INNER_STRIDE);
+    }
+
+#define COMPUTE_RP(rp)                                                       \
+  if constexpr (RP > rp) {                                                   \
+    bfloat16x8_t a = vld1q_bf16(a_ptr[rp] + ki * PACK_ELEMENTS_PER_K_CHUNK); \
+    BFMMLA_COMPUTE_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], a, b0,  \
+                     b1, b2, b3);                                            \
+  }
+    COMPUTE_RP(0);
+    COMPUTE_RP(1);
+    COMPUTE_RP(2);
+    COMPUTE_RP(3);
+#undef COMPUTE_RP
+  }
+
+  // K tail for runtime PV: fallback path
+  if constexpr (runtime_k) {
+    if (K_tail > 0) {
+      const int32_t tail_offset = K_iters * V_INNER_STRIDE;
+      const int32_t a_tail_offset = K_iters * PACK_ELEMENTS_PER_K_CHUNK;
+      for (int32_t kt = 0; kt < K_tail; ++kt) {
+        float32x4_t b_vecs[4];
+        for (int32_t p = 0; p < 4; ++p) {
+          // A V row block holds 8 values as
+          // [col0: tokens 0..3][col1: tokens 0..3] (the operand the main loop
+          // feeds to vbfmmlaq_f32), so the two columns of tail token kt are
+          // V_TOKENS_PER_ROW_BLOCK elements apart, not adjacent.
+          const bfloat16_t* bp = b_ptr[p] + tail_offset;
+          const float b0 = vcvtah_f32_bf16(bp[kt]);
+          const float b1 = vcvtah_f32_bf16(bp[kt + V_TOKENS_PER_ROW_BLOCK]);
+          const float32x2_t b_pair = vset_lane_f32(b1, vdup_n_f32(b0), 1);
+          // [c0, c1, c0, c1] to line up with the accumulator layout.
+          b_vecs[p] = vcombine_f32(b_pair, b_pair);
+        }
+
+#define TAIL_RP(rp)                                                     \
+  if constexpr (RP > rp) {                                              \
+    const bfloat16_t* ap = A_packed_rp[rp] + a_tail_offset;             \
+    float a_row0 = vcvtah_f32_bf16(ap[kt]);                             \
+    float a_row1 =                                                      \
+        (m_rows_rp[rp] == 2) ? vcvtah_f32_bf16(ap[kt + TILE_K]) : 0.0f; \
+    const float32x4_t a_vec =                                           \
+        vcombine_f32(vdup_n_f32(a_row0), vdup_n_f32(a_row1));           \
+    for (int32_t p = 0; p < 4; ++p) {                                   \
+      acc[rp][p] = vmlaq_f32(acc[rp][p], a_vec, b_vecs[p]);             \
+    }                                                                   \
+  }
+        TAIL_RP(0);
+        TAIL_RP(1);
+        TAIL_RP(2);
+        TAIL_RP(3);
+#undef TAIL_RP
+      }
+    }
+  }
+
+  // Store results
+#define STORE_RP(rp)                                                    \
+  if constexpr (RP > rp) {                                              \
+    STORE_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
+                        C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp]);     \
+  }
+  STORE_RP(0);
+  STORE_RP(1);
+  STORE_RP(2);
+  STORE_RP(3);
+#undef STORE_RP
+}
+
+// Meso-kernel: packs a small MBxK slice of A, then tiles over N and calls the
+// micro-kernel for each OUTPUT_COLS_PER_BLOCK chunk. K_static < 0 enables
+// runtime K (PV only).
+template <int32_t MB, int32_t N, int32_t K_static, AttentionGemmPhase phase>
+FORCE_INLINE void gemm_packA_compute_MB_xN(
+    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
+    float* __restrict C, int32_t K_runtime, int64_t lda, int64_t ldc,
+    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
+  static_assert(MB >= 1 && MB <= 8, "MB must be in [1,8]");
+  static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
+                "N must be a multiple of OUTPUT_COLS_PER_BLOCK");
+  static_assert(K_static < 0 || K_static % TILE_K == 0,
+                "K must be divisible by TILE_K");
+  static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
+                "Runtime K only supported for PV");
+
+  constexpr bool runtime_k = (K_static < 0);
+  const int32_t K_val = runtime_k ? K_runtime : K_static;
+
+  // Keep small packs on-stack to avoid heap churn
+  constexpr int32_t STACK_PACK_STRIDE =
+      (1024 / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
+
+  constexpr int32_t ROW_PAIRS = (MB + 1) / TILE_ROWS;
+  const int32_t pack_stride =
+      runtime_k ? ((K_val + TILE_K - 1) / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK
+                : (K_static / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
+
+  alignas(64) c10::BFloat16 A_packed_stack[ROW_PAIRS * STACK_PACK_STRIDE];
+  std::vector<c10::BFloat16> A_packed_heap;
+  c10::BFloat16* A_packed =
+      (pack_stride <= STACK_PACK_STRIDE)
+          ? A_packed_stack
+          : (A_packed_heap.resize(ROW_PAIRS * pack_stride),
+             A_packed_heap.data());
+
+  for (int32_t rp = 0; rp < ROW_PAIRS; ++rp) {
+    const int32_t m = rp * TILE_ROWS;
+    const int32_t m_rows = (m + 1 < MB) ? TILE_ROWS : 1;
+    const c10::BFloat16* A0 = A + m * lda;
+    const c10::BFloat16* A1 = (m_rows == TILE_ROWS) ? (A + (m + 1) * lda) : A0;
+    reshape_Q_2xK_for_bfmmla(A0, A1, A_packed + rp * pack_stride, K_val);
+  }
+
+  for (int32_t n = 0; n < N; n += OUTPUT_COLS_PER_BLOCK) {
+    const c10::BFloat16* B_blk_c10 =
+        (phase == AttentionGemmPhase::PV)
+            ? (B + (n / TILE_COLS) * b_layout_stride)
+            : (B + (n / OUTPUT_COLS_PER_BLOCK) * b_layout_stride);
+    const bfloat16_t* B_blk = reinterpret_cast<const bfloat16_t*>(B_blk_c10);
+
+    // Process row-pairs in groups of 4, 2, then 1
+    int32_t row_pair_idx = 0;
+
+#define PROCESS_RP_GROUP(group_size)                                       \
+  for (; row_pair_idx + (group_size - 1) < ROW_PAIRS;                      \
+       row_pair_idx += group_size) {                                       \
+    const bfloat16_t* Ap[group_size];                                      \
+    int32_t mr[group_size];                                                \
+    for (int32_t i = 0; i < group_size; ++i) {                             \
+      Ap[i] = reinterpret_cast<const bfloat16_t*>(                         \
+          A_packed + (row_pair_idx + i) * pack_stride);                    \
+      mr[i] = (((row_pair_idx + i) * TILE_ROWS + 1) < MB) ? TILE_ROWS : 1; \
+    }                                                                      \
+    float* C_blk = C + (row_pair_idx * TILE_ROWS) * ldc + n;               \
+    if constexpr (runtime_k) {                                             \
+      gemm_rowpairs_x8_bfmmla_neon<group_size, -1, phase>(                 \
+          Ap, mr, B_blk, C_blk, ldc, accumulate, b_layout_stride, K_val);  \
+    } else {                                                               \
+      gemm_rowpairs_x8_bfmmla_neon<group_size, K_static, phase>(           \
+          Ap, mr, B_blk, C_blk, ldc, accumulate,                           \
+          (phase == AttentionGemmPhase::PV) ? b_layout_stride              \
+                                            : b_reduction_stride);         \
+    }                                                                      \
+  }
+
+    PROCESS_RP_GROUP(4);
+    PROCESS_RP_GROUP(2);
+    PROCESS_RP_GROUP(1);
+#undef PROCESS_RP_GROUP
+  }
+}
+
+// Macro-kernel: walks the M dimension greedily in MB = 8, 4, 2, 1 row chunks
+// and hands each chunk to gemm_packA_compute_MB_xN.
+// K >= 0 selects the compile-time-K specialization (K_runtime is ignored and
+// 0 is forwarded in its place); K < 0 selects the runtime-K path, which is
+// only supported for PV.
+template <AttentionGemmPhase phase, int32_t N, int32_t K = -1>
+FORCE_INLINE void gemm_macro_neon_bfmmla(
+    const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
+    float* __restrict C, int32_t M, int32_t K_runtime, int64_t lda, int64_t ldc,
+    int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
+  static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
+                "N must be a multiple of OUTPUT_COLS_PER_BLOCK");
+  static_assert(K < 0 || K % TILE_K == 0, "K must be divisible by TILE_K");
+  static_assert(K >= 0 || phase == AttentionGemmPhase::PV,
+                "Runtime K specialization only supported for PV.");
+
+  // Any negative K means "runtime K"; the meso-kernel is instantiated with -1.
+  constexpr int32_t k_static = (K >= 0) ? K : -1;
+  // The compile-time path never reads the runtime value, so forward 0 there.
+  const int32_t k_forwarded = (K >= 0) ? 0 : K_runtime;
+
+#define DISPATCH_MB(mb)                                                      \
+  gemm_packA_compute_MB_xN<mb, N, k_static, phase>(                          \
+      A + row * lda, B, C + row * ldc, k_forwarded, lda, ldc,                \
+      b_layout_stride, b_reduction_stride, accumulate)
+
+  int32_t row = 0;
+  while (row < M) {
+    const int32_t left = M - row;
+    if (left >= 8) {
+      DISPATCH_MB(8);
+      row += 8;
+    } else if (left >= 4) {
+      DISPATCH_MB(4);
+      row += 4;
+    } else if (left >= 2) {
+      DISPATCH_MB(2);
+      row += 2;
+    } else {
+      DISPATCH_MB(1);
+      row += 1;
+    }
+  }
+#undef DISPATCH_MB
+}
+
+#undef INIT_ACC_ROWPAIR_4
+#undef STORE_ACC_ROWPAIR_4
+#undef BFMMLA_COMPUTE_4
+
+}  // namespace
+
+// TileGemm Adapter for Attention
+
+template <typename kv_cache_t, int32_t BlockTokens, int32_t HeadDim>
+class TileGemmNEONBFMMLA {
+ public:
+  template <AttentionGemmPhase phase, int32_t head_dim_ct>
+  FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                [[maybe_unused]] const int64_t ldb,
+                                const int64_t ldc,
+                                [[maybe_unused]] const int32_t block_size,
+                                [[maybe_unused]] const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    static_assert(BlockTokens % OUTPUT_COLS_PER_BLOCK == 0);
+    // BFMMLA kernels require compile-time head_dim; keep head_dim_ct only for
+    // API parity with other tile_gemm implementations.
+    if constexpr (head_dim_ct >= 0) {
+      static_assert(head_dim_ct == HeadDim,
+                    "BFMMLA expects head_dim_ct to match HeadDim; PV passes "
+                    "-1 for API parity.");
+    }
+
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      const int64_t b_reduction_stride = K_INNER_STRIDE;
+      const int64_t b_token_block_stride = (HeadDim / TILE_K) * K_INNER_STRIDE;
+
+      gemm_macro_neon_bfmmla<AttentionGemmPhase::QK, BlockTokens, HeadDim>(
+          reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
+          m_size, 0, lda, ldc, b_token_block_stride, b_reduction_stride,
+          accum_c);
+    } else {
+      const int64_t b_pair_stride =
+          (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+
+      // PV gemm with runtime K specialization
+      switch (dynamic_k_size) {
+        case 32:
+          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 32>(
+              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
+              m_size, 32, lda, ldc, b_pair_stride, 0, accum_c);
+          break;
+        case 128:
+          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 128>(
+              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
+              m_size, 128, lda, ldc, b_pair_stride, 0, accum_c);
+          break;
+        case 256:
+          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 256>(
+              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
+              m_size, 256, lda, ldc, b_pair_stride, 0, accum_c);
+          break;
+        default:
+          gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim>(
+              reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
+              m_size, dynamic_k_size, lda, ldc, b_pair_stride, 0, accum_c);
+          break;
+      }
+    }
+  }
+};
+
+// Shared ASIMD BFMMLA implementation (BF16 only). The block size alignment and
+// ISA tag are template parameters so we can reuse the same kernels for
+// different NEON configurations.
+template <int64_t block_size_alignment, ISA isa_type, int64_t head_dim>
+class AttentionImplNEONBFMMLA {
+ public:
+  using query_t = c10::BFloat16;
+  using q_buffer_t = c10::BFloat16;
+  using kv_cache_t = c10::BFloat16;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = c10::BFloat16;
+
+  static constexpr int64_t BlockSizeAlignment = block_size_alignment;
+  // HeadDimAlignment equals head_dim so that the PV phase processes
+  // the full head dimension in a single gemm call.
+  static constexpr int64_t HeadDimAlignment = head_dim;
+  static constexpr int64_t MaxQHeadNumPerIteration = 16;
+  static constexpr int64_t HeadDim = head_dim;
+  static constexpr ISA ISAType = isa_type;
+  static constexpr bool scale_on_logits = false;
+
+  static_assert(HeadDim % OUTPUT_COLS_PER_BLOCK == 0);
+  static_assert(BlockSizeAlignment % OUTPUT_COLS_PER_BLOCK == 0);
+  static_assert(HeadDim % TILE_K == 0, "HeadDim must be a multiple of TILE_K");
+
+ public:
+  template <template <typename tile_gemm_t> typename attention>
+  FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+    attention<
+        TileGemmNEONBFMMLA<kv_cache_t, static_cast<int32_t>(BlockSizeAlignment),
+                           static_cast<int32_t>(HeadDim)>>
+        attention_iteration;
+    attention_iteration(CPU_ATTENTION_PARAMS);
+  }
+
+  // Key cache stride per token group (TokenColumn layout; QK)
+  static constexpr int64_t k_cache_token_group_stride(
+      [[maybe_unused]] const int32_t block_size) {
+    static_assert(BlockSizeAlignment % K_TOKENS_PER_GROUP == 0);
+    return (BlockSizeAlignment / K_TOKENS_PER_GROUP) *
+           ((head_dim / TILE_K) * K_INNER_STRIDE);
+  }
+
+  // Value cache stride per token group (TokenRow layout; PV)
+  static constexpr int64_t v_cache_token_group_stride(
+      [[maybe_unused]] const int32_t block_size) {
+    static_assert(BlockSizeAlignment % V_TOKENS_PER_ROW_BLOCK == 0);
+    return (BlockSizeAlignment / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+  }
+
+  // The stride to move to the "next" head_dim group
+  // is the full V cache size per head, since HeadDimAlignment == head_dim.
+  // Hence, the stride is not used in this case
+  static constexpr int64_t v_cache_head_group_stride(
+      [[maybe_unused]] const int32_t block_size) {
+    return head_dim * block_size;
+  }
+
+  // Copy BF16 Q heads into q_buffer while applying the scale factor: each
+  // group of 8 values is widened to FP32, multiplied by `scale`, and narrowed
+  // back to BF16 with native BF16 intrinsics.
+  // - src:            first Q element; token qi / head h starts at
+  //                   src + qi * q_num_stride + h * q_head_stride.
+  // - q_buffer:       dense destination,
+  //                   [q_num][q_heads_per_kv][head_dim] elements.
+  // - scale:          factor applied to every element (scale_on_logits is
+  //                   false for this implementation, so Q carries the scale).
+  static void copy_q_heads_tile(c10::BFloat16* __restrict__ src,
+                                c10::BFloat16* __restrict__ q_buffer,
+                                const int32_t q_num,
+                                const int32_t q_heads_per_kv,
+                                const int64_t q_num_stride,
+                                const int64_t q_head_stride, float scale) {
+    constexpr int32_t dim = static_cast<int32_t>(head_dim);
+    const float32x4_t scale_vec = vdupq_n_f32(scale);
+
+    for (int32_t qi = 0; qi < q_num; ++qi) {
+      for (int32_t head_i = 0; head_i < q_heads_per_kv; ++head_i) {
+        c10::BFloat16* __restrict__ curr_q =
+            src + qi * q_num_stride + head_i * q_head_stride;
+        // Widen before multiplying so the row offset is computed in 64 bits.
+        c10::BFloat16* __restrict__ dst =
+            q_buffer +
+            (static_cast<int64_t>(qi) * q_heads_per_kv + head_i) * head_dim;
+
+        // head_dim is a multiple of OUTPUT_COLS_PER_BLOCK (static_assert on
+        // the class), so the 8-wide loop covers the row exactly.
+        for (int32_t i = 0; i < dim; i += OUTPUT_COLS_PER_BLOCK) {
+          bfloat16x8_t in8 =
+              vld1q_bf16(reinterpret_cast<const bfloat16_t*>(curr_q + i));
+          // Named scaled_* rather than lo/hi so they cannot be confused with
+          // (or shadow) a loop index.
+          float32x4_t scaled_lo =
+              vmulq_f32(vcvtq_low_f32_bf16(in8), scale_vec);
+          float32x4_t scaled_hi =
+              vmulq_f32(vcvtq_high_f32_bf16(in8), scale_vec);
+
+          bfloat16x4_t lo_b = vcvt_bf16_f32(scaled_lo);
+          bfloat16x4_t hi_b = vcvt_bf16_f32(scaled_hi);
+          bfloat16x8_t out = vcombine_bf16(lo_b, hi_b);
+          vst1q_bf16(reinterpret_cast<bfloat16_t*>(dst + i), out);
+        }
+      }
+    }
+  }
+
+ public:
+  // Reshape and cache K/V into BFMMLA-optimized layouts
+  // K cache:
+  // [block_size/K_TOKENS_PER_GROUP][head_dim/TILE_K][K_INNER_STRIDE]
+  // - TokenColumn
+  // V cache:
+  // [head_dim/TILE_COLS][block_size/V_TOKENS_PER_ROW_BLOCK][V_INNER_STRIDE]
+  // - TokenRows
+  static void reshape_and_cache(
+      const c10::BFloat16* __restrict__ key,
+      const c10::BFloat16* __restrict__ value,
+      c10::BFloat16* __restrict__ key_cache,
+      c10::BFloat16* __restrict__ value_cache,
+      const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+      const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+      const int64_t head_num, const int64_t key_head_num_stride,
+      const int64_t value_head_num_stride,
+      [[maybe_unused]] const int64_t num_blocks,
+      const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+      const int64_t block_size,
+      [[maybe_unused]] const int64_t block_size_stride) {
+    const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
+    const int64_t v_pair_stride =
+        (block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
+
+#pragma omp parallel for
+    for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+      for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+        const int64_t pos = slot_mapping[token_idx];
+        if (pos < 0) continue;
+
+        const int64_t block_idx = pos / block_size;
+        const int64_t block_offset = pos % block_size;
+
+        // Key cache: TokenColumn QK
+        {
+          const c10::BFloat16* __restrict key_src =
+              key + token_idx * key_token_num_stride +
+              head_idx * key_head_num_stride;
+
+          c10::BFloat16* __restrict key_base = key_cache +
+                                               block_idx * num_blocks_stride +
+                                               head_idx * cache_head_num_stride;
+
+          const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
+          const int64_t pair_in_block =
+              (block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
+          const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;
+
+          c10::BFloat16* __restrict block_base =
+              key_base + block_in_block * k_block_stride;
+
+          for (int64_t hd4 = 0; hd4 < head_dim / TILE_K; ++hd4) {
+            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(
+                block_base + hd4 * K_INNER_STRIDE +
+                pair_in_block * V_INNER_STRIDE + lane_base);
+            const uint16_t* src_u16 =
+                reinterpret_cast<const uint16_t*>(key_src + hd4 * TILE_K);
+            vst1_u16(dst_u16, vld1_u16(src_u16));
+          }
+        }
+
+        // Value cache: TokenRow PV
+        {
+          const c10::BFloat16* __restrict value_src =
+              value + token_idx * value_token_num_stride +
+              head_idx * value_head_num_stride;
+
+          c10::BFloat16* __restrict value_base =
+              value_cache + block_idx * num_blocks_stride +
+              head_idx * cache_head_num_stride;
+
+          const int64_t row_block = block_offset / V_TOKENS_PER_ROW_BLOCK;
+          const int64_t lane = block_offset & (V_TOKENS_PER_ROW_BLOCK - 1);
+
+          c10::BFloat16* __restrict row_block_base =
+              value_base + row_block * V_INNER_STRIDE;
+
+          for (int64_t hd2 = 0; hd2 < head_dim / TILE_COLS; ++hd2) {
+            c10::BFloat16* __restrict dst_val =
+                row_block_base + hd2 * v_pair_stride;
+
+            const uint16_t* src_u16 =
+                reinterpret_cast<const uint16_t*>(value_src);
+            uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(dst_val);
+            dst_u16[lane] = src_u16[hd2 * TILE_COLS + 0];
+            dst_u16[lane + V_TOKENS_PER_ROW_BLOCK] =
+                src_u16[hd2 * TILE_COLS + 1];
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace cpu_attention
+
+#endif  // CPU_ATTN_NEON_BFMMLA_HPP
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -14,13 +14,11 @@ struct KernelVecType<float> {
  using cvt_vec_type = vec_op::FP32Vec16;
 };

-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct KernelVecType<c10::BFloat16> {
  using load_vec_type = vec_op::BF16Vec16;
  using cvt_vec_type = vec_op::FP32Vec16;
 };
-#endif

 template <>
 struct KernelVecType<c10::Half> {
--- a/csrc/cpu/generate_cpu_attn_dispatch.py
+++ b/csrc/cpu/generate_cpu_attn_dispatch.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Generate CPU attention dispatch switch cases and kernel instantiations.
+"""
+
+import os
+
+# Head dimensions divisible by 32; a case is emitted for every ISA in a group
+HEAD_DIMS_32 = [32, 64, 96, 128, 160, 192, 224, 256]
+
+# Head dimensions divisible by 16 but not 32; always bound to the VEC16 kernel
+HEAD_DIMS_16 = [80, 112]
+
+# ISA name -> numeric id; must match the value of cpu_attention::ISA::<name>
+ISA_TYPES = {
+    "AMX": 0,
+    "VEC": 1,
+    "VEC16": 2,
+    "NEON": 3,
+}
+
+# ISAs that get their own kernel for head_dims divisible by 32
+ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16"]
+
+# ISA for head_dims divisible by 16 only (not read elsewhere in this script)
+ISA_FOR_16 = ["VEC16"]
+
+
+def encode_params(head_dim: int, isa_type: str) -> int:
+    """Pack head_dim and the numeric ISA id into one switch key."""
+    isa_bits = ISA_TYPES[isa_type]
+    head_dim_bits = head_dim << 8
+    # Layout: ISA id in bits 0-7, head_dim from bit 8 up (same as the C++ helper)
+    return head_dim_bits | isa_bits
+
+
+def generate_cases_for_isa_group(isa_list: list[str]) -> str:
+    """Return the switch `case` blocks, one per (head_dim, ISA in isa_list)."""
+    cases = []
+
+    # head_dims divisible by 32: one case per ISA, bound to that ISA's kernel
+    for head_dim in HEAD_DIMS_32:
+        for isa in isa_list:
+            if isa not in ISA_FOR_32:
+                continue
+            encoded = encode_params(head_dim, isa)
+            case_str = (  # every emitted line ends in "\" (macro continuation)
+                f"""      case {encoded}LL: {{ """
+                f"""/* head_dim={head_dim}, isa={isa} */ \\"""
+                f"""
+        constexpr size_t head_dim = {head_dim}; \\"""
+                f"""
+        using attn_impl = cpu_attention::AttentionImpl<"""
+                f"""cpu_attention::ISA::{isa}, \\"""
+                f"""
+                                                       """
+                f"""scalar_t, head_dim>; \\"""
+                f"""
+        return __VA_ARGS__(); \\"""
+                f"""
+      }} \\"""
+            )
+            cases.append(case_str)
+
+    # head_dims divisible by 16 only: one case per ISA, all bound to ISA::VEC16
+    for head_dim in HEAD_DIMS_16:
+        for isa in isa_list:
+            encoded = encode_params(head_dim, isa)
+            case_str = (
+                f"""      case {encoded}LL: {{ """
+                f"""/* head_dim={head_dim}, isa={isa} """
+                f"""(using VEC16) */ \\"""
+                f"""
+        constexpr size_t head_dim = {head_dim}; \\"""
+                f"""
+        using attn_impl = cpu_attention::AttentionImpl<"""
+                f"""cpu_attention::ISA::VEC16, \\"""
+                f"""
+                                                       """
+                f"""scalar_t, head_dim>; \\"""
+                f"""
+        return __VA_ARGS__(); \\"""
+                f"""
+      }} \\"""
+            )
+            cases.append(case_str)
+
+    return "\n".join(cases)
+
+
+def generate_helper_function() -> str:
+    """Return the C++ inline helper that builds the same key as encode_params."""
+    return """
+inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
+  return (head_dim << 8) | static_cast<int64_t>(isa);
+}
+"""
+
+
+def generate_header_file() -> str:
+    """Build and return the full text of cpu_attn_dispatch_generated.h."""
+    header = """// auto generated by generate_cpu_attn_dispatch.py
+// clang-format off
+
+#ifndef CPU_ATTN_DISPATCH_GENERATED_H
+#define CPU_ATTN_DISPATCH_GENERATED_H
+
+#include "cpu_attn_vec.hpp"
+#include "cpu_attn_vec16.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu_attn_amx.hpp"
+#endif
+
+#ifdef __aarch64__
+  #include "cpu_attn_neon.hpp"
+#endif
+
+"""
+
+    header += generate_helper_function()
+
+    # One CPU_ATTN_DISPATCH definition per build target, picked by #if/#elif/#else
+    header += """
+// Dispatch macro using encoded parameters
+"""
+
+    # CPU_CAPABILITY_AMXBF16 builds: AMX, VEC and VEC16 cases
+    header += """#if defined(CPU_CAPABILITY_AMXBF16)
+#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
+  [&] { \\
+    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
+    switch (encoded_params) { \\
+"""
+    header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
+    header += """
+      default: { \\
+        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
+                    std::to_string(HEAD_DIM) + " isa=" + \\
+                    std::to_string(static_cast<int>(ISA_TYPE))); \\
+      } \\
+    } \\
+  }()
+
+"""
+
+    # __aarch64__ builds: NEON, VEC and VEC16 cases
+    header += """#elif defined(__aarch64__)
+#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
+  [&] { \\
+    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
+    switch (encoded_params) { \\
+"""
+    header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
+    header += """
+      default: { \\
+        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
+                    std::to_string(HEAD_DIM) + " isa=" + \\
+                    std::to_string(static_cast<int>(ISA_TYPE))); \\
+      } \\
+    } \\
+  }()
+
+"""
+
+    # Any other target: VEC and VEC16 cases only
+    header += """#else
+#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
+  [&] { \\
+    int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
+    switch (encoded_params) { \\
+"""
+    header += generate_cases_for_isa_group(["VEC", "VEC16"])
+    header += """
+      default: { \\
+        TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
+                    std::to_string(HEAD_DIM) + " isa=" + \\
+                    std::to_string(static_cast<int>(ISA_TYPE))); \\
+      } \\
+    } \\
+  }()
+
+#endif  /* CPU_CAPABILITY_AMXBF16 / __aarch64__ */
+
+#endif  // CPU_ATTN_DISPATCH_GENERATED_H
+"""
+
+    return header
+
+
+def main():
+    """Write the generated dispatch header next to this script."""
+    script_dir = os.path.dirname(__file__)
+    target = os.path.join(script_dir, "cpu_attn_dispatch_generated.h")
+    with open(target, "w") as out:
+        header_text = generate_header_file()
+        out.write(header_text)
+
+
+if __name__ == "__main__":
+    main()
--- a/csrc/cpu/mla_decode.cpp
+++ b/csrc/cpu/mla_decode.cpp
@@ -38,9 +38,16 @@ struct KernelVecType<c10::BFloat16> {
  using qk_vec_type = vec_op::BF16Vec32;
  using v_load_vec_type = vec_op::BF16Vec16;
 };
-#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
-// pass
-#else
+
+#elif defined(__s390x__)
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using qk_load_vec_type = vec_op::BF16Vec16;
+  using qk_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
+
+#elif defined(__aarch64__)
 template <>
 struct KernelVecType<c10::BFloat16> {
  using qk_load_vec_type = vec_op::BF16Vec16;
--- a/csrc/cpu/sgl-kernels/gemm.cpp
+++ b/csrc/cpu/sgl-kernels/gemm.cpp
@@ -265,7 +265,7 @@ void tinygemm_kernel(
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
        case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/gemm_fp8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_fp8.cpp
@@ -324,7 +324,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/gemm_int8.cpp
+++ b/csrc/cpu/sgl-kernels/gemm_int8.cpp
@@ -180,7 +180,7 @@ void tinygemm_kernel(
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
        case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/moe.cpp
+++ b/csrc/cpu/sgl-kernels/moe.cpp
@@ -398,7 +398,7 @@ void tinygemm_kernel(
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
@@ -511,7 +511,7 @@ void tinygemm_kernel(
        case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break;
        // mb_size = 4
        case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/sgl-kernels/moe_int8.cpp
+++ b/csrc/cpu/sgl-kernels/moe_int8.cpp
@@ -271,7 +271,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
@@ -401,7 +401,7 @@ void tinygemm_kernel(
        case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break;
        case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break;
        case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break;
-        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+        default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
      }
    }
  }
--- a/csrc/cpu/utils.hpp
+++ b/csrc/cpu/utils.hpp
@@ -30,12 +30,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
 };

-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
 };
-#endif

 #if !defined(__powerpc__)
 template <>
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  if (flag) {  // support GPUDirect RDMA if possible
    prop.allocFlags.gpuDirectRDMACapable = 1;
  }
+  int fab_flag = 0;
+  CUDA_CHECK(cuDeviceGetAttribute(
+      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+  if (fab_flag) {  // support fabric handle if possible
+    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+  }
 #endif

 #ifndef USE_ROCM
  // Allocate memory using cuMemCreate
-  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+  CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
+  if (ret) {
+    if (fab_flag &&
+        (ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
+      // Fabric allocation may fail without multi-node nvlink,
+      // fallback to POSIX file descriptor
+      prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+      CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+    } else {
+      CUDA_CHECK(ret);
+    }
+  }
  if (error_code != 0) {
    return;
  }
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -3,7 +3,8 @@
 #include "cutlass/cutlass.h"
 #include <climits>
 #include "cuda_runtime.h"
-#include <iostream>
+#include <cstdio>
+#include <cstdlib>

 /**
 * Helper function for checking CUTLASS errors
@@ -31,12 +32,63 @@ int32_t get_sm_version_num();
 * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
 * into code that will be executed on the device where it is defined.
 */
+
+template <typename Kernel>
+struct enable_sm75_to_sm80 : Kernel {  // forwards invoke() for 750 <= arch < 800
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // device pass only; the host pass sees an empty body
+  #if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else  // device code built for an arch outside [750, 800)
+    printf("This kernel only supports sm[75, 80).\n");
+    asm("trap;");  // PTX trap: stop the kernel instead of returning silently
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {  // forwards invoke() for 800 <= arch < 890
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // device pass only; the host pass sees an empty body
+  #if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else  // device code built for an arch outside [800, 890)
+    printf("This kernel only supports sm[80, 89).\n");
+    asm("trap;");  // PTX trap: stop the kernel instead of returning silently
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm89_to_sm90 : Kernel {  // forwards invoke() for 890 <= arch < 900
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__  // device pass only; the host pass sees an empty body
+  #if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
+    Kernel::invoke(std::forward<Args>(args)...);
+  #else  // device code built for an arch outside [890, 900)
+    printf("This kernel only supports sm[89, 90).\n");
+    asm("trap;");  // PTX trap: stop the kernel instead of returning silently
+  #endif
+#endif
+  }
+};
+
 template <typename Kernel>
 struct enable_sm90_or_later : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ >= 900
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm >= 90.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };
@@ -45,18 +97,43 @@ template <typename Kernel>
 struct enable_sm90_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 900
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm90.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };

 template <typename Kernel>
-struct enable_sm100_only : Kernel {
+struct enable_sm100f_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm100f.\n");
+    asm("trap;");
+  #endif
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm100a_only : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1000
+    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm100a.\n");
+    asm("trap;");
+  #endif
 #endif
  }
 };
@@ -65,7 +142,30 @@ template <typename Kernel>
 struct enable_sm120_only : Kernel {
   template <typename... Args>
   CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ == 1200
+    Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm120.\n");
+    asm("trap;");
+  #endif
+#endif
+  }
+};
+
+// SM12x family includes SM120 (RTX 5090) and SM121 (DGX Spark GB10).
+// Like the other arch guards in this file, device code built for an arch
+// outside [1200, 1300) prints a message and traps instead of doing nothing.
+template <typename Kernel>
+struct enable_sm120_family : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__
+  #if __CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300
     Kernel::operator()(std::forward<Args>(args)...);
+  #else
+    printf("This kernel only supports sm12x.\n");
+    asm("trap;");
+  #endif
 #endif
   }
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -770,7 +770,7 @@ torch::Tensor moe_wna16_marlin_gemm(
    b_bias = b_bias_or_none.value();
    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
-    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(1) != size_n");
    TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
  } else {
    b_bias = torch::empty({0}, options);
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -14,12 +14,10 @@ void moe_permute(
    const torch::Tensor& token_expert_indices,       // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
-    const std::optional<int64_t>& align_block_size,
    torch::Tensor& permuted_input,             // [permuted_size, hidden]
    torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
-    torch::Tensor& permuted_idx,               // [permute_size]
-    torch::Tensor& m_indices) {                // [align_expand_m]
+    torch::Tensor& permuted_idx) {             // [permute_size]
  TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
@@ -34,8 +32,6 @@ void moe_permute(
              "token_expert_indices shape must be same as inv_permuted_idx");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
-  auto align_block_size_value =
-      align_block_size.has_value() ? align_block_size.value() : -1;
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const long sorter_size =
      CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert);
@@ -73,42 +69,15 @@ void moe_permute(
      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);

-  // DeepGEMM: use getMIndices kernel to compute
-  // 1) align_expert_first_token_offset (aligned prefix offsets)
-  // 2) m_indices (expert id for each aligned row)
-  // eg. expert0: 3, expert1: 5, expert2: 2 tokens respectively
-  // expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
-  // expert0: 3->4, expert1: 5->8, expert2: 2->4
-  // align_expert_first_token_offset = [0, 4, 12, 16]
-  // so m_indices = [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
-  torch::Tensor align_expert_first_token_offset;
-  const int64_t* aligned_expert_first_token_offset_ptr = nullptr;
-  if (align_block_size.has_value()) {
-    align_expert_first_token_offset =
-        torch::zeros_like(expert_first_token_offset);
-    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
-                get_ptr<int64_t>(align_expert_first_token_offset),
-                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
-                stream);
-    aligned_expert_first_token_offset_ptr =
-        get_ptr<int64_t>(align_expert_first_token_offset);
-  }
-
  // dispatch expandInputRowsKernelLauncher
  MOE_DISPATCH(input.scalar_type(), [&] {
    expandInputRowsKernelLauncher<scalar_t>(
        get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
-        get_ptr<int64_t>(expert_first_token_offset),
-        aligned_expert_first_token_offset_ptr, n_token, valid_num_ptr, n_hidden,
-        topk, n_local_expert, align_block_size_value, stream);
+        get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
+        n_hidden, topk, n_local_expert, stream);
  });
-
-  // this is only required for DeepGemm and not required for CUTLASS group gemm
-  if (align_block_size.has_value()) {
-    expert_first_token_offset.copy_(align_expert_first_token_offset);
-  }
 }

 void moe_unpermute(
@@ -201,16 +170,13 @@ void shuffle_rows(const torch::Tensor& input_tensor,

 #else

-void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
-                 torch::Tensor& topk_ids,
+void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_ids,
                 const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                 const std::optional<int64_t>& align_block_size,
                 torch::Tensor& permuted_input,
                 torch::Tensor& expert_first_token_offset,
-                 torch::Tensor& src_row_id2dst_row_id_map,
-                 torch::Tensor& m_indices) {
+                 torch::Tensor& inv_permuted_idx, torch::Tensor& permuted_idx) {
  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
 }

--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -168,64 +168,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
      topk_id_ptr, size, expert_map_ptr, num_experts);
 }

-template <bool ALIGN_BLOCK_SIZE>
-__global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
-                                  int64_t* align_expert_first_token_offset,
-                                  int* m_indices, const int num_local_expert,
-                                  const int align_block_size) {
-  int eidx = blockIdx.x;
-  int tidx = threadIdx.x;
-  extern __shared__ int64_t smem_expert_first_token_offset[];
-  for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
-  }
-  __syncthreads();
-  auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
-  auto first_token_offset = smem_expert_first_token_offset[eidx];
-  int n_token_in_expert = last_token_offset - first_token_offset;
-
-  if constexpr (ALIGN_BLOCK_SIZE) {
-    n_token_in_expert = (n_token_in_expert + align_block_size - 1) /
-                        align_block_size * align_block_size;
-    // round up to ALIGN_BLOCK_SIZE
-    int64_t accumulate_align_offset = 0;
-    for (int i = 1; i <= eidx + 1; i++) {
-      int n_token = smem_expert_first_token_offset[i] -
-                    smem_expert_first_token_offset[i - 1];
-      accumulate_align_offset =
-          accumulate_align_offset + (n_token + align_block_size - 1) /
-                                        align_block_size * align_block_size;
-      if (i == eidx) {
-        first_token_offset = accumulate_align_offset;
-      }
-      // last block store align_expert_first_token_offset
-      if (eidx == num_local_expert - 1 && threadIdx.x == 0) {
-        align_expert_first_token_offset[i] = accumulate_align_offset;
-      }
-    }
-  }
-  for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) {
-    // update m_indice with expert id
-    m_indices[first_token_offset + idx] = eidx;
-  }
-}
-
-void getMIndices(int64_t* expert_first_token_offset,
-                 int64_t* align_expert_first_token_offset, int* m_indices,
-                 int num_local_expert, const int align_block_size,
-                 cudaStream_t stream) {
-  int block = 256;
-  int grid = num_local_expert;
-  int smem_size = sizeof(int64_t) * (num_local_expert + 1);
-  if (align_block_size == -1) {
-    getMIndicesKernel<false><<<grid, block, smem_size, stream>>>(
-        expert_first_token_offset, align_expert_first_token_offset, m_indices,
-        num_local_expert, align_block_size);
-  } else {
-    getMIndicesKernel<true><<<grid, block, smem_size, stream>>>(
-        expert_first_token_offset, align_expert_first_token_offset, m_indices,
-        num_local_expert, align_block_size);
-  }
-}
-
 #endif
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -60,10 +60,9 @@ void expandInputRowsKernelLauncher(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
-    int num_local_experts, const int& align_block_size, cudaStream_t stream);
+    int num_local_experts, cudaStream_t stream);

 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
@@ -76,9 +75,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                              const int* expert_map_ptr, int num_experts,
                              cudaStream_t stream);

-void getMIndices(int64_t* expert_first_token_offset,
-                 int64_t* align_expert_first_token_offset, int* m_indices,
-                 int num_local_expert, const int align_block_size,
-                 cudaStream_t stream);
-
 #include "moe_permute_unpermute_kernel.inl"
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl
@@ -1,14 +1,13 @@
 #pragma once

-template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
+template <typename T, bool CHECK_SKIPPED>
 __global__ void expandInputRowsKernel(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_dest_rows, int64_t const cols, int64_t k,
-    int num_local_experts, int align_block_size) {
+    int num_local_experts) {
  // Reverse permutation map.
  // I do this so that later, we can use the source -> dest map to do the k-way
  // reduction and unpermuting. I need the reverse map for that reduction to
@@ -19,24 +18,6 @@ __global__ void expandInputRowsKernel(
      expanded_dest_row_to_expanded_source_row[expanded_dest_row];
  int expert_id = sorted_experts[expanded_dest_row];

-  if constexpr (ALIGN_BLOCK_SIZE) {
-    // convert (unaligned) expanded_dest_row -> aligned expanded_dest_row.
-    // aligned_expert_first_token_offset[e] provides the aligned prefix start
-    // for expert e. For non-local experts we map to the end (total aligned M).
-    int64_t aligned_base = 0;
-    int64_t token_offset_in_expert = 0;
-    if (expert_id >= num_local_experts) {
-      aligned_base =
-          __ldg(aligned_expert_first_token_offset + num_local_experts);
-      token_offset_in_expert = 0;
-    } else {
-      aligned_base = __ldg(aligned_expert_first_token_offset + expert_id);
-      token_offset_in_expert =
-          expanded_dest_row - __ldg(expert_first_token_offset + expert_id);
-    }
-    expanded_dest_row = aligned_base + token_offset_in_expert;
-  }
-
  if (threadIdx.x == 0) {
    assert(expanded_dest_row <= INT32_MAX);
    expanded_source_row_to_expanded_dest_row[expanded_source_row] =
@@ -76,29 +57,25 @@ void expandInputRowsKernelLauncher(
    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
    int const* expanded_dest_row_to_expanded_source_row,
    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
-    int64_t const* expert_first_token_offset,
-    int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
+    int64_t const* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
-    int num_local_experts, const int& align_block_size, cudaStream_t stream) {
+    int num_local_experts, cudaStream_t stream) {
  int64_t const blocks = num_rows * k;
  int64_t const threads = 256;
-  using FuncPtr = decltype(&expandInputRowsKernel<T, true, true>);
-  FuncPtr func_map[2][2] = {
-      {&expandInputRowsKernel<T, false, false>,
-       &expandInputRowsKernel<T, false, true>},
-      {&expandInputRowsKernel<T, true, false>,
-       &expandInputRowsKernel<T, true, true>},
+  using FuncPtr = decltype(&expandInputRowsKernel<T, true>);
+  FuncPtr func_map[2] = {
+      &expandInputRowsKernel<T, false>,
+      &expandInputRowsKernel<T, true>,
  };
  bool is_check_skip = num_valid_tokens_ptr != nullptr;
-  bool is_align_block_size = align_block_size != -1;
-  auto func = func_map[is_check_skip][is_align_block_size];
+  auto func = func_map[is_check_skip];

  func<<<blocks, threads, 0, stream>>>(
      unpermuted_input, permuted_output, sorted_experts,
      expanded_dest_row_to_expanded_source_row,
      expanded_source_row_to_expanded_dest_row, permuted_idx,
-      expert_first_token_offset, aligned_expert_first_token_offset, num_rows,
-      num_valid_tokens_ptr, cols, k, num_local_experts, align_block_size);
+      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
+      num_local_experts);
 }

 template <class T, class U>
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -99,9 +99,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "moe_permute(Tensor input, Tensor topk_ids,"
      "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
      "int n_local_expert,"
-      "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
+      "int topk, Tensor! permuted_input, Tensor! "
      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
-      "permuted_idx, Tensor! m_indices)->()");
+      "permuted_idx)->()");

  m.def(
      "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
@@ -141,8 +141,8 @@ struct cutlass_3x_gemm_sm100 {
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 template <typename ElementAB_, typename ElementD_,
@@ -202,8 +202,8 @@ struct cutlass_3x_gemm_sm120 {
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 }  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -123,7 +123,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
          MainloopScheduler
      >::CollectiveOp>;

-  using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@@ -103,7 +103,8 @@ struct cutlass_3x_gemm_fp8_blockwise {
          MainloopScheduler
      >::CollectiveOp;

-  using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
+  // SM12x family to support both SM120 (RTX 5090) and SM121 (DGX Spark)
+  using KernelType = enable_sm120_family<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -90,8 +90,8 @@ struct cutlass_3x_gemm_sm100_fp8 {
  // -----------------------------------------------------------
  // Kernel definition
  // -----------------------------------------------------------
-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 template <typename InType, typename OutType, bool EnableBias>
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
@@ -36,41 +36,6 @@ using namespace cute;
 */

 namespace vllm {
-
-// Wrappers for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm75_to_sm80 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm80_to_sm89 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm89_to_sm90 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
 template <typename Arch, template <typename> typename ArchGuard,
          typename ElementAB_, typename ElementD_,
          template <typename, typename> typename Epilogue_, typename TileShape,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_fp8_dispatch.cuh
@@ -50,7 +50,7 @@ struct sm89_fp8_config_default {
      using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -58,7 +58,7 @@ struct sm89_fp8_config_default {
      using TileShape = typename cutlass::gemm::GemmShape<256, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -67,7 +67,7 @@ struct sm89_fp8_config_default {
      using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -100,7 +100,7 @@ struct sm89_fp8_config_M256 {
      using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -108,7 +108,7 @@ struct sm89_fp8_config_M256 {
      using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -141,7 +141,7 @@ struct sm89_fp8_config_M128 {
      using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -150,7 +150,7 @@ struct sm89_fp8_config_M128 {
      using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -158,7 +158,7 @@ struct sm89_fp8_config_M128 {
      using TileShape = typename cutlass::gemm::GemmShape<128, 64, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -191,7 +191,7 @@ struct sm89_fp8_config_M64 {
      using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -201,7 +201,7 @@ struct sm89_fp8_config_M64 {
      using FP8MathOperator = typename cutlass::arch::OpMultiplyAddFastAccum;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -211,7 +211,7 @@ struct sm89_fp8_config_M64 {
      using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -244,7 +244,7 @@ struct sm89_fp8_config_M32 {
      using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -253,7 +253,7 @@ struct sm89_fp8_config_M32 {
      using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 4, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -262,7 +262,7 @@ struct sm89_fp8_config_M32 {
      using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5, FP8MathOperator>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -296,7 +296,7 @@ struct sm89_fp8_config_M16 {
      using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, MainLoopStages,
                                FP8MathOperator>,
@@ -305,7 +305,7 @@ struct sm89_fp8_config_M16 {
      using TileShape = typename cutlass::gemm::GemmShape<16, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, MainLoopStages,
                                FP8MathOperator>,
@@ -314,7 +314,7 @@ struct sm89_fp8_config_M16 {
      using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, MainLoopStages,
                                FP8MathOperator>,
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x_sm89_int8_dispatch.cuh
@@ -48,7 +48,7 @@ struct sm89_int8_config_default {
      using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -56,7 +56,7 @@ struct sm89_int8_config_default {
      using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -64,7 +64,7 @@ struct sm89_int8_config_default {
      using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -72,7 +72,7 @@ struct sm89_int8_config_default {
      using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -104,7 +104,7 @@ struct sm89_int8_config_M256 {
      using TileShape = cutlass::gemm::GemmShape<64, 128, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -112,7 +112,7 @@ struct sm89_int8_config_M256 {
      using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -120,7 +120,7 @@ struct sm89_int8_config_M256 {
      using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -128,7 +128,7 @@ struct sm89_int8_config_M256 {
      using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -160,7 +160,7 @@ struct sm89_int8_config_M128 {
      using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -169,7 +169,7 @@ struct sm89_int8_config_M128 {
      using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -178,7 +178,7 @@ struct sm89_int8_config_M128 {
      using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -210,7 +210,7 @@ struct sm89_int8_config_M64 {
      using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -219,7 +219,7 @@ struct sm89_int8_config_M64 {
      using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 3>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -251,7 +251,7 @@ struct sm89_int8_config_M32 {
      using WarpShape = cutlass::gemm::GemmShape<16, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -260,7 +260,7 @@ struct sm89_int8_config_M32 {
      using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 4>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -292,7 +292,7 @@ struct sm89_int8_config_M16 {
      using TileShape = cutlass::gemm::GemmShape<16, 64, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 5>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
@@ -300,7 +300,7 @@ struct sm89_int8_config_M16 {
      using TileShape = cutlass::gemm::GemmShape<16, 128, 128>;

      return vllm::fallback_cutlass_gemm_caller<
-          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
+          vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
                                InType, OutType, Epilogue, TileShape, WarpShape,
                                InstructionShape, 4>,
          FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -1365,13 +1365,12 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
  return out_c;
 }

-#if defined(__gfx950__)  // TODO: Add NAVI support
-  // This version targets big A[] cases, where it is much larger than LDS
-  // capacity
+// This version targets skinny cases where CUs are not filled
+// Wave-SplitK is used with reduction done via atomics.
+#if defined(__gfx950__)
  #define WVSPLITKRC_1KPASS
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
-          int UNRL, int N, int GrpsShrB>
-
+          int UNRL, int N, int GrpsShrB, int CHUNKK>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
    __attribute__((amdgpu_waves_per_eu(1, 1)))
    wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
@@ -1383,12 +1382,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  int* cntr = (int*)(&glbl[M * N]);

  constexpr int NTILE = 16;
-  constexpr int WVLDS_ = (NTILE * THRDS * A_CHUNK);
  constexpr int APAD = 1;
  constexpr int ASTRD = 64;
  constexpr int BPAD = 1;
-  constexpr int BSTRD = 64;
-  constexpr int WVLDS = ((WVLDS_ + (WVLDS_ / BSTRD) * 4 * BPAD));
+  constexpr int WVLDS_ = THRDS * A_CHUNK / CHUNKK;
+  constexpr int WVLDS = ((WVLDS_ + A_CHUNK * BPAD)) * YTILE;

  constexpr int max_lds_len = LDS_SIZE / 2;

@@ -1442,17 +1440,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
        break;
    }
  #else
-  int constexpr kFit = 512;
+  int constexpr kFit = 512 / CHUNKK;
  int constexpr kfitsPerRdc = 1;
  #endif

-  bool doRdc = (kfitsPerRdc * kFit < K);
+  bool doRdc = true;  // Assuming (kfitsPerRdc * kFit < K) is always true
  uint32_t numCuWithFullK =
      ((M + (WvPrGrp * YTILE / GrpsShrB) - 1) / (WvPrGrp * YTILE / GrpsShrB));
  uint32_t Mmod = numCuWithFullK * (WvPrGrp * YTILE / GrpsShrB);

  // given above k-split, find this wave's position
-  uint32_t kFitPdd = kFit + (kFit / ASTRD) * APAD;
+  uint32_t kFitPdd = kFit * CHUNKK + ((kFit * CHUNKK) / ASTRD) * APAD;
  uint32_t m0 = (blockIdx.x * WvPrGrp / GrpsShrB) * YTILE;
  uint32_t m1 = ((threadIdx.y % WvPrGrp) / GrpsShrB) * YTILE;
  uint32_t m = (m0 + m1) % Mmod;
@@ -1460,8 +1458,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  uint32_t k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
  const uint32_t k_rnd = (K + kFit * kfitsPerRdc - 1) / (kFit * kfitsPerRdc);

-  scalar8 sum4[N / NTILE / GrpsShrB][1];
-  bigType bigB_[YTILE / GrpsShrB][UNRL];
+  scalar8 sum4[N / NTILE / GrpsShrB][1] = {0};
+  bigType bigB_[YTILE / GrpsShrB / CHUNKK][UNRL];
  const uint32_t bLoader = (threadIdx.y % GrpsShrB);
  uint32_t kBase = 0;
  if (k_str >= K) return;
@@ -1498,12 +1496,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    #pragma unroll
  for (uint32_t k2 = 0; k2 < UNRL; k2++) {
    uint32_t k = k_str + k2 * THRDS * A_CHUNK;
-    uint32_t k_ = k + threadIdx.x * A_CHUNK;
+    uint32_t k_ = k + (threadIdx.x % (THRDS / CHUNKK)) * A_CHUNK;
    const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
    #pragma unroll
-    for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
-      bigB_[y][k2].h8 = (loadnt(
-          (scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
+    for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK)
+      bigB_[y / CHUNKK][k2].h8 = (loadnt(
+          (scalar8*)(&B_[min__((y + threadIdx.x / (THRDS / CHUNKK)) * GrpsShrB +
+                                   bLoader + m,
+                               M - 1) *
+                         K])));
  }
  {
  #else
@@ -1556,48 +1557,51 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
      if (reloada) {
  #endif
        constexpr int sprdN = 4;
-        const uint32_t thrd = ((threadIdx.y / sprdN) * THRDS + threadIdx.x);
+        const uint32_t thrd = threadIdx.x % (THRDS / CHUNKK);

  #ifndef WVSPLITKRC_1KPASS
    #pragma unroll
-        for (int k = 0; k < kFit; k += THRDS * (WvPrGrp / sprdN) * A_CHUNK) {
+        for (int k = 0; k < kFit;
+             k += (THRDS * (WvPrGrp / sprdN) * A_CHUNK) / CHUNKK) {
  #else
        const unsigned int k = 0;
        {
  #endif
          unsigned int kOff = k + (thrd * A_CHUNK);
-          unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
-          const unsigned int k_in = kOffcp + ((threadIdx.y % sprdN)) * K;
-          const unsigned int k_ot = kOff + ((threadIdx.y % sprdN)) * kFitPdd;
-          for (unsigned int n = 0; n < N / 2; n += sprdN) {
-            __builtin_amdgcn_global_load_lds((int*)(&A[k_in + n * K]),
-                                             (int*)(&s[(k_ot + n * kFitPdd)]),
-                                             16, 0, 0);
-            if (((threadIdx.y % sprdN)) + n + N / 2 >= actlN) continue;
+          unsigned int kOffcp =
+              k_str + kOff;  // min__(K - A_CHUNK, k_str + kOff);
+          for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
            __builtin_amdgcn_global_load_lds(
-                (int*)(&A[k_in + (n + N / 2) * K]),
-                (int*)(&s[(k_ot + (n + N / 2) * kFitPdd)]), 16, 0, 0);
+                (int*)(&A[min__(
+                    K * actlN - A_CHUNK,
+                    kOffcp + K * (n / CHUNKK +
+                                  (N / CHUNKK) * (threadIdx.x / (64 / CHUNKK)) +
+                                  (threadIdx.y % sprdN)))]),
+                (int*)(&s[(k +
+                           kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
+                16, 0, 0);
          }

          // Stage loaded B[] to LDS for MFMA swizzling...
          for (uint32_t k2 = 0; k2 < UNRL; k2++) {
            uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-            uint32_t k_ = k + threadIdx.x * A_CHUNK;
+            uint32_t k_ = k + (threadIdx.x % (THRDS / CHUNKK)) * A_CHUNK;
            const bool oob_k = (k_ >= K);
-            for (uint32_t y = 0; y < YTILE / GrpsShrB; y++) {
-              uint32_t idx = threadIdx.x * 4 +
-                             (y * GrpsShrB + bLoader) * ((THRDS + BPAD) * 4);
+            for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK) {
+              uint32_t idx =
+                  (threadIdx.x % (THRDS / CHUNKK)) * 4 +
+                  ((y + threadIdx.x / (THRDS / CHUNKK)) * GrpsShrB + bLoader) *
+                      ((THRDS / CHUNKK + BPAD) * 4);
              // zero out if oob
              *((scalar8*)&myStg[idx]) =
-                  (oob_k || (y * GrpsShrB + bLoader + m >= M))
+                  (oob_k)  // TODO: ever necessary (y*GrpsShrB+bLoader+m>=M) ?
                      ? 0
-                      : bigB_[y][k2].h8;
+                      : bigB_[y / CHUNKK][k2].h8;
            }
          }
        }
      }
    }
-
  #ifndef WVSPLITKRC_1KPASS
    // Fire load of next B[] chunk...
    if ((k1 + THRDS * A_CHUNK * UNRL < k_end) &&
@@ -1608,40 +1612,50 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
        uint32_t k_ = k + threadIdx.x * A_CHUNK;
        const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
    #pragma unroll
-        for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
-          bigB_[y][k2].h8 = (loadnt(
-              (scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
+        for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK)
+          bigB_[y / CHUNKK][k2].h8 = (loadnt(
+              (scalar8*)(&B_[min__((y + threadIdx.x / (THRDS / CHUNKK)) *
+                                           GrpsShrB +
+                                       bLoader + m,
+                                   M - 1) *
+                             K])));
      }
  #endif

    // B[] staging is cooperative across GrpsShrB, so sync here before reading
-    // back
+    // back. This wait is currently inserted by compiler, but not guaranteed.
+    asm volatile("s_waitcnt 0");
    __syncthreads();

    // read back B[] swizzled for MFMA...
-    bigType bigB[YTILE][UNRL];
+    bigType bigB[YTILE / CHUNKK][UNRL];
    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-      for (uint32_t y = 0; y < YTILE; y++) {
-        unsigned int idx = (threadIdx.x % YTILE) * ((THRDS + BPAD) * 4) +
-                           (threadIdx.x / YTILE) * 4 + y * 16;
+      for (uint32_t y = 0; y < YTILE / CHUNKK; y++) {
+        unsigned int idx =
+            (threadIdx.x % YTILE) * ((THRDS / CHUNKK + BPAD) * 4) +
+            (threadIdx.x / YTILE) * 4 + y * 16;
        bigB[y][k2].h8 = *((scalar8*)&myStg[idx]);
      }
    }

    // rReadback A[] swizzled for MFMA...
-    bigType bigA[N / GrpsShrB][UNRL];
+    bigType bigA[N / GrpsShrB / CHUNKK][UNRL];
  #pragma unroll
    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
      uint32_t k = k1 + k2 * THRDS * A_CHUNK - kBase - k_str;
  #pragma unroll
      for (uint32_t nt = 0; nt < N / GrpsShrB; nt += NTILE)
  #pragma unroll
-        for (uint32_t n = 0; n < NTILE; n++) {
-          uint32_t idxa = (nt + (threadIdx.x % NTILE) +
-                           (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) *
-                              kFitPdd +
-                          A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
-          bigA[nt + n][k2] = *((const bigType*)(&(s[idxa])));
+        for (uint32_t n = 0; n < NTILE / CHUNKK; n++) {
+          uint32_t idxa =
+              ((nt + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) % (N / CHUNKK) +
+               (threadIdx.x % NTILE)) *
+                  kFitPdd +
+              ((nt + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) /
+               (N / CHUNKK)) *
+                  A_CHUNK * (64 / CHUNKK) +
+              A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
+          bigA[nt / CHUNKK + n][k2] = *((const bigType*)(&(s[idxa])));
        }
    }

@@ -1650,152 +1664,75 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    for (uint32_t k2 = 0; k2 < UNRL; k2++) {
  #pragma unroll
      for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
-        if constexpr (std::is_same_v<scalar_t, half>) {
-          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
-              bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
-              (k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
-          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
-              bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
-              0, 0);
-        } else {  // bf16
-          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
-              bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
-              (k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
-          sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
-              bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
-              0, 0);
-        }
  #pragma unroll
-        for (uint32_t j = 1; j < YTILE; j++) {
+        for (uint32_t j = 0; j < YTILE / CHUNKK; j++) {
          if constexpr (std::is_same_v<scalar_t, half>) {
-            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
-                bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
-                0, 0, 0);
-            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
-                bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
-                0, 0, 0);
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x32_f16(
+                bigA[nt * (YTILE / CHUNKK) + j][k2].h8, bigB[j][k2].h8,
+                sum4[nt][0], 0, 0, 0);
          } else {  // bf16
-            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
-                bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
-                0, 0, 0);
-            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
-                bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
-                0, 0, 0);
+            sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x32_bf16(
+                bigA[nt * (YTILE / CHUNKK) + j][k2].h8, bigB[j][k2].h8,
+                sum4[nt][0], 0, 0, 0);
          }
        }
      }
    }
  }

-  if (!doRdc) {
-    if (m + (threadIdx.x % 16) < M) {
-      scalar_t biases[N / NTILE / GrpsShrB][4] = {0};
+  if (m + (threadIdx.x % 16) < M) {
+    int my_cntr;
+    int mindx = m + (threadIdx.x % 16);
+    int g_mindx = m * 4 + (threadIdx.x % 64);  // coalesced atomic reduction
+    scalar_t biases[N / NTILE / GrpsShrB][4] = {};
+    // Atomic add the output, read biases
+    for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
+      for (uint32_t j = 0; j < 4; j++) {
+        // int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+        //             (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+        // int adr = mindx + M * nindx;
+        int g_nindx =
+            j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
+        int g_adr = g_mindx + M * g_nindx * 4;
+        atomicAdd(&glbl[g_adr], sum4[nt][0][j]);
+      }
+    int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
+                 (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+    int adr_ = mindx + M * nindx_ / 4;
+    // Update the complete counter
+    my_cntr = atomicAdd(&cntr[adr_], 1);
+    float vals[N / NTILE / GrpsShrB][4] = {};
+    // If we're the last k-shard, read back the value and convert...
+    if (my_cntr + 1 == k_rnd) {
      if (BIAS)
        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
          for (uint32_t j = 0; j < 4; j++) {
-            int mindx = m + (threadIdx.x % 16);
            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-            biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
+            biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
          }
        }
      for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
        for (uint32_t j = 0; j < 4; j++) {
-          int mindx = m + (threadIdx.x % 16);
-          int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                      (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-          int adr = mindx + M * nindx;
-          if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            if (BIAS) sum4[nt][0][j] += __bfloat162float(biases[nt][j]);
-            C[adr] = __float2bfloat16(sum4[nt][0][j]);
-          } else {
-            if (BIAS) sum4[nt][0][j] += __half2float(biases[nt][j]);
-            C[adr] = __float2half(sum4[nt][0][j]);
-          }
+          int g_nindx =
+              j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
+          int g_adr = g_mindx + M * g_nindx * 4;
+          vals[nt][j] = glbl[g_adr];
        }
      }
-    }
-  } else {
-    if (m + (threadIdx.x % 16) < M) {
-      int my_cntr;
-      if (!BIAS) {
-        int mindx = m + (threadIdx.x % 16);
-        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
-          for (uint32_t j = 0; j < 4; j++) {
-            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+      __builtin_amdgcn_sched_barrier(0);
+      for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
+        for (uint32_t j = 0; j < 4; j++) {
+          int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
+                      (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
+          if (nindx < actlN) {
            int adr = mindx + M * nindx;
-            atomicAdd(&glbl[adr], sum4[nt][0][j]);
-          }
-        int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
-                     (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-        int adr_ = mindx + M * nindx_ / 4;
-        my_cntr = atomicAdd(&cntr[adr_], 1);
-        float vals[N / NTILE / GrpsShrB][4] = {};
-        if (my_cntr + 1 == k_rnd) {
-          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
-            for (uint32_t j = 0; j < 4; j++) {
-              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-              int adr = mindx + M * nindx;
-              vals[nt][j] = glbl[adr];
-            }
-          }
-          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
-            for (uint32_t j = 0; j < 4; j++) {
-              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-              if (nindx >= actlN) break;
-              int adr = mindx + M * nindx;
-              if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-                C[adr] = __float2bfloat16(vals[nt][j]);
-              } else {
-                C[adr] = __float2half(vals[nt][j]);
-              }
-            }
-          }
-        }
-      } else {
-        int mindx = m + (threadIdx.x % 16);
-        scalar_t biases[N / NTILE / GrpsShrB][4] = {};
-        // Atomic add the output, read biases
-        for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
-          for (uint32_t j = 0; j < 4; j++) {
-            int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                        (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-            int adr = mindx + M * nindx;
-            atomicAdd(&glbl[adr], sum4[nt][0][j]);
-            biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
-          }
-        int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
-                     (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-        int adr_ = mindx + M * nindx_ / 4;
-        // Update the complete counter
-        my_cntr = atomicAdd(&cntr[adr_], 1);
-        float vals[N / NTILE / GrpsShrB][4] = {};
-        // If we're the last k-shard, read back the value and convert...
-        if (my_cntr + 1 == k_rnd) {
-          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
-            for (uint32_t j = 0; j < 4; j++) {
-              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-              int adr = mindx + M * nindx;
-              vals[nt][j] = glbl[adr];
-            }
-          }
-          for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
-            for (uint32_t j = 0; j < 4; j++) {
-              int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
-                          (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
-              if (nindx >= actlN) break;
-              int adr = mindx + M * nindx;
-              if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-                vals[nt][j] += __bfloat162float(biases[nt][j]);
-                C[adr] = __float2bfloat16(vals[nt][j]);
-              } else {
-                vals[nt][j] += __half2float(biases[nt][j]);
-                C[adr] = __float2half(vals[nt][j]);
-              }
+            if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
+              vals[nt][j] += __bfloat162float(biases[nt][j]);
+              C[adr] = __float2bfloat16(vals[nt][j]);
+            } else {
+              vals[nt][j] += __half2float(biases[nt][j]);
+              C[adr] = __float2half(vals[nt][j]);
            }
          }
        }
@@ -1814,7 +1751,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 }
 #else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
-          int UNRL, int N, int GrpsShrB>
+          int UNRL, int N, int GrpsShrB, int CHUNKK>
 __global__ void wvSplitKrc_(const int actlN, const int K, const int M,
                            const int Bx, const int By, const scalar_t* B,
                            const scalar_t* __restrict__ A,
@@ -1859,10 +1796,10 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // const int max_lds_len = get_lds_size() / 2;

-#define WVSPLITKrc(_WvPrGrp, _YTILE, _UNRL, _N, _GrpsShrB)                     \
+#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK)                                     \
  {                                                                            \
-    dim3 block(64, _WvPrGrp);                                                  \
-    wvSplitKrc_<fptype, 64, _YTILE, _WvPrGrp, 8, _UNRL, _N, _GrpsShrB>         \
+    dim3 block(64, 4);                                                         \
+    wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK>               \
        <<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
                                     biasf4, glbl, c, CuCount);                \
  }
@@ -1877,15 +1814,37 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
            : nullptr;
    fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
    auto glbl = axl_glbl.data_ptr<float>();
+
+    // With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
+    // and each working on a 512-shard of K, how many CUs would we need?
+    int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
+
+    // How many of 4 waves in a group can work on same 16 Ms at same time? First
+    // try to maximize this. This reduces the Ms each group works on, i.e.
+    // increasing the number of CUs needed.
+    int GrpsShrB = min(N_p2 / 16, 4);
+
+    // Given the above, how many CUs would we need?
+    int CuNeeded = rndup_cus * GrpsShrB;
+    // Reject problem sizes that would need more CUs than CuCount provides.
+    if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
+
+    // Can we increase SplitK by shrinking the K-shard to 256?
+    int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
+
    switch (N_p2) {
      case 16:
-        WVSPLITKrc(4, 16, 1, 16, 1) break;
+        WVSPLITKrc(16, 1, 1) break;
      case 32:
-        WVSPLITKrc(4, 16, 1, 32, 2) break;
+        if (chunkk == 2)
+          WVSPLITKrc(32, 2, 2) else if (chunkk == 1) WVSPLITKrc(32, 2, 1) break;
      case 64:
-        WVSPLITKrc(4, 16, 1, 64, 2) break;
+        if (chunkk == 2)
+          WVSPLITKrc(64, 4, 2) else if (chunkk == 1) WVSPLITKrc(64, 4, 1) break;
      case 128:
-        WVSPLITKrc(4, 16, 1, 128, 4) break;
+        if (chunkk == 2)
+          WVSPLITKrc(128, 4, 2) else if (chunkk == 1)
+              WVSPLITKrc(128, 4, 1) break;
      default:
        throw std::runtime_error(
            "Unsupported N value: " + std::to_string(M_in) + "," +
@@ -1899,8 +1858,9 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
-                      const int By, const fp8_t* B, const fp8_t* __restrict__ A,
+    wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp, const int M,
+                      const int Bx, const int By, const fp8_t* B,
+                      const fp8_t* __restrict__ A,
                      const scalar_t* __restrict__ BIAS, scalar_t* C,
                      const float* __restrict__ s_A,
                      const float* __restrict__ s_B, const int _WvPrGrp,
@@ -1924,9 +1884,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+  #if defined(__gfx950__)
+    __builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
+  #else
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
+  #endif
  }
+  asm volatile("s_waitcnt vmcnt(0)");
  __syncthreads();

  if (threadIdx.y >= _WvPrGrp) return;
@@ -1934,37 +1899,24 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;

  using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
-  floatx16 sum[N][YTILE];
  float sA = *s_A;
  float sB = *s_B;

  while (m < M) {
-    for (int i = 0; i < YTILE; i++)
-      for (int n = 0; n < N; n++) sum[n][i] = {0.f};
-
-    bigType bigA[N][UNRL];
-    bigType bigB[YTILE][UNRL];
-
+    floatx16 sum[N][YTILE] = {};
    for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
-  #pragma unroll
-      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-  #pragma unroll
-        for (uint32_t n = 0; n < N; ++n) bigA[n][k2].h8 = {0.f};
-  #pragma unroll
-        for (uint32_t y = 0; y < YTILE; ++y) bigB[y][k2].h8 = {0.f};
-      }
+      bigType bigA[N][UNRL] = {};
+      bigType bigB[YTILE][UNRL];

      // Fetch the weight matrix from memory!
  #pragma unroll
      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
        uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
-        const fp8_t* B_ = &B[(m + 0) * Kp + k_];
+        const fp8_t* B_ = &B[min__(k_, K - A_CHUNK)];
  #pragma unroll
        for (uint32_t y = 0; y < YTILE; ++y) {
-          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp])));
+          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
        }
      }

@@ -1975,16 +1927,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
        uint32_t k_ = k + threadIdx.x * A_CHUNK;
        if (k_ >= K) break;
        for (int n = 0; n < N; n++) {
-          bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
+          bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
        }
      }

  // Do the matrix multiplication in interleaved manner
  #pragma unroll
      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-        if (k >= K) break;
-
        for (uint32_t n = 0; n < N; n++) {
          for (int i = 0; i < A_CHUNK; i += 8) {
            for (int y = 0; y < YTILE; ++y) {
@@ -2002,48 +1951,27 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
      for (int y = 0; y < YTILE; y++) {
        float accm0 = sum[n][y][0];
        float accm16 = sum[n][y][8];
-        asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16));
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][1], 0x101, 0xf, 0xf,
+                                          1);  // row_shl1
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][9], 0x101, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][2], 0x102, 0xf, 0xf,
+                                          1);  // row_shl2
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][10], 0x102, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][3], 0x103, 0xf, 0xf,
+                                          1);  // row_shl3
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][11], 0x103, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][4], 0x108, 0xf, 0xf,
+                                          1);  // row_shl8
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][12], 0x108, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][5], 0x109, 0xf, 0xf,
+                                          1);  // row_shl9
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][13], 0x109, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][6], 0x10a, 0xf, 0xf,
+                                          1);  // row_shl10
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][14], 0x10a, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][7], 0x10b, 0xf, 0xf,
+                                          1);  // row_shl11
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][15], 0x10b, 0xf, 0xf, 1);
        accm0 += __shfl(accm0, 36);
        accm16 += __shfl(accm16, 52);
        sum[n][y][0] = accm0 + __shfl(accm16, 16);
@@ -2051,19 +1979,23 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    }

    if (threadIdx.x == 0) {
+      scalar_t biases[N][YTILE] = {};
+      if (BIAS)
+        for (int n = 0; n < N; n++) {
+          for (int y = 0; y < YTILE; y++) {
+            biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+          }
+        }
      for (int n = 0; n < N; n++) {
        for (int y = 0; y < YTILE; y++) {
          if (y + m >= M) break;  // To avoid mem access fault.
          sum[n][y][0] *= sA * sB;
          if constexpr (std::is_same_v<scalar_t, half>) {
-            if (BIAS)
-              sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
+            sum[n][y][0] += __half2float(biases[n][y]);
          } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            if (BIAS)
-              sum[n][y][0] +=
-                  __bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
+            sum[n][y][0] += __bfloat162float(biases[n][y]);
          }
-          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);  // * sA * sB);
+          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
        }
      }
    }
@@ -2074,9 +2006,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
-__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
-                                  const int Bx, const int By, const fp8_t* B,
-                                  const fp8_t* __restrict__ A,
+__global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
+                                  const int M, const int Bx, const int By,
+                                  const fp8_t* B, const fp8_t* __restrict__ A,
                                  const scalar_t* __restrict__ BIAS,
                                  scalar_t* C, const float* __restrict__ s_A,
                                  const float* __restrict__ s_B,
@@ -2089,8 +2021,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
-    wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
-                  const int By, const fp8_t* B, const fp8_t* __restrict__ A,
+    wvSplitKQ_hf_(const int K, const int Kap, const int Kbp, const int M,
+                  const int Bx, const int By, const fp8_t* B,
+                  const fp8_t* __restrict__ A,
                  const scalar_t* __restrict__ BIAS, scalar_t* C,
                  const float* __restrict__ s_A, const float* __restrict__ s_B,
                  const int _WvPrGrp, const int CuCount) {
@@ -2113,9 +2046,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  __shared__ fp8_t s[max_lds_len];

  for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
-       k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+       k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
+  #if defined(__gfx950__)
+    __builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
+  #else
    *((bigType*)(&s[k])) = *((bigType*)(&A[k]));
+  #endif
  }
+  asm volatile("s_waitcnt vmcnt(0)");
  __syncthreads();

  if (threadIdx.y >= _WvPrGrp) return;
@@ -2123,29 +2061,23 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
  uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;

  using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
-  floatx16 sum[N][YTILE];
  float sA = *s_A;
  float sB = *s_B;

  while (m < M) {
-    for (int i = 0; i < YTILE; i++)
-      for (int n = 0; n < N; n++) sum[n][i] = {0};
-
-    bigType bigA[N][UNRL];
-    bigType bigB[YTILE][UNRL];
-
+    floatx16 sum[N][YTILE] = {};
    for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
+      bigType bigA[N][UNRL] = {};
+      bigType bigB[YTILE][UNRL];
+
      // Fetch the weight matrix from memory!
  #pragma unroll
      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
        uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
-        const fp8_t* B_ = &B[(m + 0) * Kp + k_];
+        const fp8_t* B_ = &B[min__(k_, K - A_CHUNK)];
        for (int y = 0; y < YTILE; ++y) {
-          if (y + m >= M) break;  // To avoid mem access fault.
-          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp])));
+          bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
        }
      }

@@ -2156,20 +2088,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
        uint32_t k_ = k + threadIdx.x * A_CHUNK;
        if (k_ >= K) break;
        for (int n = 0; n < N; n++) {
-          if (k_ + K * n < max_lds_len)
-            bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
+          if (k_ + Kap * n < max_lds_len)
+            bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
          else
-            bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
+            bigA[n][k2] = *((const bigType*)(&(A[k_ + Kap * n])));
        }
      }

  // Do the matrix multiplication in interleaved manner
  #pragma unroll
      for (uint32_t k2 = 0; k2 < UNRL; k2++) {
-        uint32_t k = k1 + k2 * THRDS * A_CHUNK;
-        uint32_t k_ = k + threadIdx.x * A_CHUNK;
-        if (k_ >= K) break;
-
        for (uint32_t n = 0; n < N; n++) {
          for (int i = 0; i < A_CHUNK; i += 8) {
            for (int y = 0; y < YTILE; ++y) {
@@ -2187,48 +2115,27 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
      for (int y = 0; y < YTILE; y++) {
        float accm0 = sum[n][y][0];
        float accm16 = sum[n][y][8];
-        asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16));
-        asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
-            : "=v"(accm0)
-            : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0));
-        asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
-            : "=v"(accm16)
-            : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16));
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][1], 0x101, 0xf, 0xf,
+                                          1);  // row_shl1
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][9], 0x101, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][2], 0x102, 0xf, 0xf,
+                                          1);  // row_shl2
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][10], 0x102, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][3], 0x103, 0xf, 0xf,
+                                          1);  // row_shl3
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][11], 0x103, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][4], 0x108, 0xf, 0xf,
+                                          1);  // row_shl8
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][12], 0x108, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][5], 0x109, 0xf, 0xf,
+                                          1);  // row_shl9
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][13], 0x109, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][6], 0x10a, 0xf, 0xf,
+                                          1);  // row_shl10
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][14], 0x10a, 0xf, 0xf, 1);
+        accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][7], 0x10b, 0xf, 0xf,
+                                          1);  // row_shl11
+        accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][15], 0x10b, 0xf, 0xf, 1);
        accm0 += __shfl(accm0, 36);
        accm16 += __shfl(accm16, 52);
        sum[n][y][0] = accm0 + __shfl(accm16, 16);
@@ -2236,17 +2143,21 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
    }

    if (threadIdx.x == 0) {
+      scalar_t biases[N][YTILE] = {};
+      if (BIAS)
+        for (int n = 0; n < N; n++) {
+          for (int y = 0; y < YTILE; y++) {
+            biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
+          }
+        }
      for (int n = 0; n < N; n++) {
        for (int y = 0; y < YTILE; y++) {
          if (y + m >= M) break;  // To avoid mem access fault.
          sum[n][y][0] *= sA * sB;
          if constexpr (std::is_same_v<scalar_t, half>) {
-            if (BIAS)
-              sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
+            sum[n][y][0] += __half2float(biases[n][y]);
          } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            if (BIAS)
-              sum[n][y][0] +=
-                  __bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
+            sum[n][y][0] += __bfloat162float(biases[n][y]);
          }
          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
        }
@@ -2259,9 +2170,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 #else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
          int A_CHUNK, int UNRL, int N>
-__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
-                              const int Bx, const int By, const fp8_t* B,
-                              const fp8_t* __restrict__ A,
+__global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
+                              const int M, const int Bx, const int By,
+                              const fp8_t* B, const fp8_t* __restrict__ A,
                              const scalar_t* __restrict__ BIAS, scalar_t* C,
                              const float* __restrict__ s_A,
                              const float* __restrict__ s_B, const int _WvPrGrp,
@@ -2270,17 +2181,18 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
 }
 #endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support

-void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
+void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
               const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
               const at::Tensor& scale_a, const at::Tensor& scale_b,
               const int64_t CuCount) {
  static c10::ScalarType kFp8Type = is_fp8_ocp()
                                        ? c10::ScalarType::Float8_e4m3fn
                                        : c10::ScalarType::Float8_e4m3fnuz;
-  auto M_in = in_a.size(0);
-  auto K_in = in_a.size(1);
-  auto N_in = in_b.size(0);
-  auto Kp_in = in_a.stride(0);
+  auto M_in = in_b.size(0);
+  auto K_in = in_b.size(1);
+  auto N_in = in_a.size(0);
+  auto Kap_in = in_a.stride(0);
+  auto Kbp_in = in_b.stride(0);
  auto Bx_in =
      (in_bias.has_value() && in_bias->numel() > 0)
          ? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
@@ -2300,23 +2212,22 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  const int max_lds_len = get_lds_size();

-#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
-                  _N)                                                          \
-  {                                                                            \
-    dim3 block(64, _WvPrGrp);                                                  \
-    if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {               \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp);               \
-      wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N>  \
-          <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
-                                       b_ptr, bias_ptr, c_ptr, s_a, s_b,       \
-                                       __wvPrGrp, CuCount);                    \
-    } else {                                                                   \
-      int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp);               \
-      wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N>      \
-          <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
-                                       b_ptr, bias_ptr, c_ptr, s_a, s_b,       \
-                                       __wvPrGrp, CuCount);                    \
-    }                                                                          \
+#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)             \
+  {                                                                           \
+    dim3 block(64, _WvPrGrp);                                                 \
+    if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {            \
+      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16));     \
+      wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
+                                       By_in, b_ptr, a_ptr, bias_ptr, c_ptr,  \
+                                       s_a, s_b, __wvPrGrp, CuCount);         \
+    } else {                                                                  \
+      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16));     \
+      wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N>     \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
+                                       By_in, b_ptr, a_ptr, bias_ptr, c_ptr,  \
+                                       s_a, s_b, __wvPrGrp, CuCount);         \
+    }                                                                         \
  }

  AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
@@ -2332,16 +2243,16 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
                          : nullptr;
      switch (N_in) {
        case 1:
-          WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)
+          WVSPLITKQ(12, 2, 2, 2, 2, 1)
          break;
        case 2:
-          WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 2)
+          WVSPLITKQ(12, 2, 2, 2, 2, 2)
          break;
        case 3:
-          WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 3)
+          WVSPLITKQ(8, 2, 2, 1, 1, 3)
          break;
        case 4:
-          WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 4)
+          WVSPLITKQ(4, 2, 2, 1, 1, 4)
          break;
        default:
          throw std::runtime_error(
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -97,9 +97,7 @@ ARG PYTHON_VERSION
 ENV DEBIAN_FRONTEND=noninteractive

 # Install system dependencies including build tools
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
+RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        ccache \
        software-properties-common \
@@ -322,7 +320,7 @@ WORKDIR /workspace

 # Build DeepGEMM wheel
 # Default moved here from tools/install_deepgemm.sh for centralized version management
-ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6
+ARG DEEPGEMM_GIT_REF=477618cd51baffca09c4b0b87e97c03fe827ef03
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
    mkdir -p /tmp/deepgemm/dist && \
@@ -502,9 +500,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

 # Install Python and system dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
+RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        software-properties-common \
        curl \
@@ -586,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
 # From versions.json: .flashinfer.version
-ARG FLASHINFER_VERSION=0.6.1
+ARG FLASHINFER_VERSION=0.6.3
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
    && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
@@ -713,9 +709,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
+RUN apt-get update -y \
    && apt-get install -y git

 # We can specify the standard or nightly build of PyTorch
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -20,9 +20,7 @@ ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
 # Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
+RUN apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
@@ -172,9 +170,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

 # Install Python and other dependencies
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
+RUN apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && for i in 1 2 3; do \
@@ -221,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.6.1
+# release version: v0.6.3
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo "git clone flashinfer..." \
-    && git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.3 --recursive https://github.com/flashinfer-ai/flashinfer.git \
    && cd flashinfer \
    && git submodule update --init --recursive \
    && echo "finish git clone flashinfer..." \
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base

 ARG ARG_PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
-ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1

 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -1,5 +1,5 @@
 # Base UBI image for s390x architecture
-ARG BASE_UBI_IMAGE_TAG=9.5-1736404155
+ARG BASE_UBI_IMAGE_TAG=9.6
 ARG PYTHON_VERSION=3.12
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base

@@ -14,12 +14,18 @@ ENV LANG=C.UTF-8 \

 # Install development utilities
 RUN microdnf install -y \
-    which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
+    which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-binutils gcc-toolset-14-libatomic-devel patch zlib-devel \
    libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
    clang llvm-devel llvm-static clang-devel && \
    microdnf clean all

+ENV GCC_TOOLSET_ROOT=/opt/rh/gcc-toolset-14/root \
+    PATH=/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:/usr/bin:/bin \
+    LD_LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64:/usr/local/lib:/usr/lib64 \
+    LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64 \
+    PKG_CONFIG_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig
+
 # Python Installation
 FROM base AS python-install
 ARG PYTHON_VERSION
@@ -87,13 +93,13 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \

 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VISION_VERSION=v0.23.0
+ARG TORCH_VISION_VERSION=v0.25.0
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
    git clone https://github.com/pytorch/vision.git && \
    cd vision && \
    git checkout $TORCH_VISION_VERSION && \
-    uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
+    uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cpu && \
    python setup.py bdist_wheel

 FROM python-install AS hf-xet-builder
@@ -174,7 +180,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
       sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
    fi && python setup.py bdist_wheel
-    
+
+# Build OpenCV from source for s390x
+FROM python-install AS opencv-builder
+WORKDIR /tmp
+ARG MAX_JOBS
+ARG OPENCV_VERSION=90
+ARG ENABLE_HEADLESS=1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install numpy setuptools  wheel scikit_build build && \
+    git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
+    cd opencv-python && \
+    python -m build --wheel --installer=uv --outdir /tmp/opencv-python/dist
+
 # Build Outlines Core
 FROM python-install AS outlines-core-builder
 WORKDIR /tmp
@@ -198,7 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Final build stage
 FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
-
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 # Set correct library path for torch and numactl
 ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
 ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
@@ -209,7 +227,8 @@ ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
 ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
 ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 COPY . /workspace/vllm
 WORKDIR /workspace/vllm

@@ -225,23 +244,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
    --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
    --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
+    --mount=type=bind,from=opencv-builder,source=/tmp/opencv-python/dist,target=/tmp/opencv-wheels/ \
    --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
-     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
     HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
     LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
     NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
+     OPENCV_WHL_FILE=$(ls /tmp/opencv-wheels/*.whl) && \
     OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
-    uv pip install -v \    
+     uv pip install -v \    
        $ARROW_WHL_FILE  \
        $VISION_WHL_FILE \
        $HF_XET_WHL_FILE \
        $LLVM_WHL_FILE \
        $NUMBA_WHL_FILE \
+        $OPENCV_WHL_FILE \
        $OUTLINES_CORE_WHL_FILE \
        --index-strategy unsafe-best-match \
-        -r requirements/build.txt \
+        -r requirements/cpu-build.txt \
        -r requirements/cpu.txt


@@ -252,7 +273,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 # setup non-root user for vllm
 RUN umask 002 && \
-    useradd --uid 2000 --gid 0 vllm && \
+    /usr/sbin/useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /home/vllm && \
    chmod g+rwx /home/vllm

--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -1,8 +1,13 @@
-FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
+
+WORKDIR /workspace/
+
+ARG PYTHON_VERSION=3.12
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
+    add-apt-repository -y ppa:kobuk-team/intel-graphics

 RUN apt clean && apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@@ -22,13 +27,19 @@ RUN apt clean && apt-get update -y && \
    python3.12-dev \
    python3-pip

-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
+RUN apt update && apt upgrade -y && \
+    apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
+    apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3

-RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
+ENV PATH="/root/.local/bin:$PATH"
+ENV VIRTUAL_ENV="/opt/venv"
+ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"

 # This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
-ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
+ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.8_offline.sh"
 RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
    bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
    rm "${ONECCL_INSTALLER}" && \
@@ -41,20 +52,31 @@ SHELL ["bash", "-c"]
 CMD ["bash", "-c", "source /root/.bashrc && exec bash"]

 WORKDIR /workspace/vllm
-COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
-COPY requirements/common.txt /workspace/vllm/requirements/common.txt

-# suppress the python externally managed environment error
-RUN python3 -m pip config set global.break-system-packages true
+ENV UV_HTTP_TIMEOUT=500

-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install --no-cache-dir \
-    -r requirements/xpu.txt
+# Configure package index for XPU
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE="copy"

-# arctic-inference is built from source which needs torch-xpu properly installed
-# used for suffix method speculative decoding
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install --no-cache-dir arctic-inference==0.1.1
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,src=requirements/common.txt,target=/workspace/vllm/requirements/common.txt \
+    --mount=type=bind,src=requirements/xpu.txt,target=/workspace/vllm/requirements/xpu.txt \
+    uv pip install --upgrade pip && \
+    uv pip install -r requirements/xpu.txt
+
+ # used for suffix method speculative decoding
+ # build deps for proto + nanobind-based extensions to set up the build environment
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install grpcio-tools protobuf nanobind
+ # arctic-inference is built from source which needs torch-xpu properly installed first
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/intel/oneapi/setvars.sh --force && \
+    source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force && \
+    export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
+    uv pip install --no-build-isolation arctic-inference==0.1.1

 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

@@ -66,30 +88,32 @@ RUN --mount=type=bind,source=.git,target=.git \
 ENV VLLM_TARGET_DEVICE=xpu
 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

-RUN --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
-    pip install --no-build-isolation .
+    uv pip install --no-build-isolation .

 CMD ["/bin/bash"]

 FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope

 # install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
+RUN uv pip install -e tests/vllm_test_utils

 # install nixl from source code
 ENV NIXL_VERSION=0.7.0
-RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py

-# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
-RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
+# Fix triton: remove any installed triton/triton-xpu and install the pinned triton-xpu version
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip uninstall triton triton-xpu && \
+    uv pip install triton-xpu==3.6.0

 # remove torch bundled oneccl to avoid conflicts
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip uninstall oneccl oneccl-devel -y
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip uninstall oneccl oneccl-devel

 ENTRYPOINT ["vllm", "serve"]
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -50,7 +50,7 @@
      "default": "cuda"
    },
    "DEEPGEMM_GIT_REF": {
-      "default": "594953acce41793ae00a1233eb516044d604bcb6"
+      "default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
    },
    "PPLX_COMMIT_HASH": {
      "default": "12cecfd"
@@ -68,7 +68,7 @@
      "default": "true"
    },
    "FLASHINFER_VERSION": {
-      "default": "0.6.1"
+      "default": "0.6.3"
    },
    "GDRCOPY_CUDA_VERSION": {
      "default": "12.8"
--- a/docs/assets/design/arch_overview/v1_process_architecture_tp2_dp4.png
+++ b/docs/assets/design/arch_overview/v1_process_architecture_tp2_dp4.png
--- a/docs/assets/design/arch_overview/v1_process_architecture_tp4.png
+++ b/docs/assets/design/arch_overview/v1_process_architecture_tp4.png
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@@ -32,6 +32,7 @@ th {
 | HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
 | Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
 | Custom | ✅ | ✅ | Local file: `data.jsonl` |
+| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` |

 Legend:

@@ -133,6 +134,33 @@ vllm bench serve --port 9001 --save-result --save-detailed \

 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

+#### Custom multimodal dataset
+
+If the multimodal dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" fields per entry, e.g., `mm_data.jsonl`:
+
+```json
+{"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}
+{"prompt": "What colour is the bird shown in the image?", "image_files": ["/path/to/image/folder/flycatcher.jpeg"]}
+```
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct
+```
+
+```bash
+# run benchmarking script
+vllm bench serve --save-result --save-detailed \
+  --backend openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name custom_mm \
+  --dataset-path <path-to-your-mm-data-jsonl> \
+  --allowed-local-media-path /path/to/image/folder
+```
+
+Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.
+
 #### VisionArena Benchmark for Vision Language Models

 ```bash
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -82,7 +82,7 @@ vllm bench sweep serve \
    You can use `--dry-run` to preview the commands to be run.

    We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
-    Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
+    Between each benchmark run, we call all `/reset_*_cache` endpoints to get a clean slate for the next run.
    In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.

 !!! note
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -291,6 +291,52 @@ Based on the configuration, the content of the multi-modal caches on `P0` and `P
 K: Stores the hashes of multi-modal items
 V: Stores the processed tensor data of multi-modal items

+## CPU Resources for GPU Deployments
+
+vLLM V1 uses a multi-process architecture (see [V1 Process Architecture](../design/arch_overview.md#v1-process-architecture)) where each process requires CPU resources. Underprovisioning CPU cores is a common source of performance degradation, especially in virtualized environments.
+
+### Minimum CPU Requirements
+
+For a deployment with `N` GPUs, there are at minimum:
+
+- **1 API server process** -- handles HTTP requests, tokenization, and input processing
+- **1 engine core process** -- runs the scheduler and coordinates GPU workers
+- **N GPU worker processes** -- one per GPU, executes model forward passes
+
+This means there are always at least **`2 + N` processes** competing for CPU time.
+
+!!! warning
+    Using fewer physical CPU cores than processes will cause contention and significantly degrade throughput and latency. The engine core process runs a busy loop and is particularly sensitive to CPU starvation.
+
+The minimum is `2 + N` physical cores (1 for the API server, 1 for the engine core, and 1 per GPU worker). In practice, allocating more cores improves performance because the OS, PyTorch background threads, and other system processes also need CPU time.
+
+!!! important
+    Please note we are referring to **physical CPU cores** here. If your system has hyperthreading enabled, then 1 vCPU = 1 hyperthread = 1/2 physical CPU core, so you need `2 x (2 + N)` minimum vCPUs.
+
+### Data Parallel and Multi-API Server Deployments
+
+When using data parallelism or multiple API servers, the CPU requirements increase:
+
+```console
+Minimum physical cores = A + DP + N + (1 if DP > 1 else 0)
+```
+
+where `A` is the API server count (defaults to `DP`), `DP` is the data parallel size, and `N` is the total number of GPUs. For example, with `DP=4, TP=2` on 8 GPUs:
+
+```console
+4 API servers + 4 engine cores + 8 GPU workers + 1 DP coordinator = 17 processes
+```
+
+### Performance Impact
+
+CPU underprovisioning particularly impacts:
+
+- **Input processing throughput** -- tokenization, chat template rendering, and multi-modal data loading all run on CPU
+- **Scheduling latency** -- the engine core scheduler runs on CPU and directly affects how quickly new tokens are dispatched to the GPU workers
+- **Output processing** -- detokenization, networking, and especially streaming token responses use CPU cycles
+
+If you observe that GPU utilization is lower than expected, CPU contention may be the bottleneck. Increasing the number of available CPU cores and even the clock speed can significantly improve end-to-end performance.
+
 ## Attention Backend Selection

 vLLM supports multiple attention backends optimized for different hardware and use cases. The backend is automatically selected based on your GPU architecture, model type, and configuration, but you can also manually specify one for optimal performance.
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -138,7 +138,7 @@ These models should follow the same instructions as case (1), but they should in

 For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
 Please follow the same guidelines as case (2) for implementing these models.
-We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
+We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
 Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
--- a/docs/contributing/model/multimodal.md
+++ b/docs/contributing/model/multimodal.md
@@ -739,7 +739,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
        ```

    However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
-    a BOS token (`<s>`) is also added to the promopt:
+    a BOS token (`<s>`) is also added to the prompt:

    ??? code

--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
 - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
 - Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
 - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
+- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)

 ## Test with the API

--- a/docs/deployment/docker.md
+++ b/docs/deployment/docker.md
@@ -1,161 +1,13 @@
+---
+toc_depth: 2
+---
+
 # Using Docker

-## Use vLLM's Official Docker Image
+## Pre-built images

-vLLM offers an official Docker image for deployment.
-The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
+--8<-- "docs/getting_started/installation/gpu.md:pre-built-images"

-```bash
-docker run --runtime nvidia --gpus all \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HF_TOKEN=$HF_TOKEN" \
-    -p 8000:8000 \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --model Qwen/Qwen3-0.6B
-```
+## Build image from source

-This image can also be used with other container engines such as [Podman](https://podman.io/).
-
-```bash
-podman run --device nvidia.com/gpu=all \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
-  --env "HF_TOKEN=$HF_TOKEN" \
-  -p 8000:8000 \
-  --ipc=host \
-  docker.io/vllm/vllm-openai:latest \
-  --model Qwen/Qwen3-0.6B
-```
-
-You can add any other [engine-args](../configuration/engine_args.md) you need after the image tag (`vllm/vllm-openai:latest`).
-
-!!! note
-    You can either use the `ipc=host` flag or `--shm-size` flag to allow the
-    container to access the host's shared memory. vLLM uses PyTorch, which uses shared
-    memory to share data between processes under the hood, particularly for tensor parallel inference.
-
-!!! note
-    Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
-
-    If you need to use those dependencies (having accepted the license terms),
-    create a custom Dockerfile on top of the base image with an extra layer that installs them:
-
-    ```Dockerfile
-    FROM vllm/vllm-openai:v0.11.0
-
-    # e.g. install the `audio` optional dependencies
-    # NOTE: Make sure the version of vLLM matches the base image!
-    RUN uv pip install --system vllm[audio]==0.11.0
-    ```
-
-!!! tip
-    Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
-
-    To use the development version of `transformers`, create a custom Dockerfile on top of the base image
-    with an extra layer that installs their code from source:
-
-    ```Dockerfile
-    FROM vllm/vllm-openai:latest
-
-    RUN uv pip install --system git+https://github.com/huggingface/transformers.git
-    ```
-
-## Building vLLM's Docker Image from Source
-
-You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
-
-```bash
-# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-DOCKER_BUILDKIT=1 docker build . \
-    --target vllm-openai \
-    --tag vllm/vllm-openai \
-    --file docker/Dockerfile
-```
-
-!!! note
-    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
-    current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
-    for vLLM to find the current GPU type and build for that.
-
-    If you are using Podman instead of Docker, you might need to disable SELinux labeling by
-    adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
-
-!!! note
-    If you have not changed any C++ or CUDA kernel code, you can use precompiled wheels to significantly reduce Docker build time.
-
-    *   **Enable the feature** by adding the build argument: `--build-arg VLLM_USE_PRECOMPILED="1"`.
-    *   **How it works**: By default, vLLM automatically finds the correct wheels from our [Nightly Builds](../contributing/ci/nightly_builds.md) by using the merge-base commit with the upstream `main` branch.
-    *   **Override commit**: To use wheels from a specific commit, provide the `--build-arg VLLM_PRECOMPILED_WHEEL_COMMIT=<commit_hash>` argument.
-
-    For a detailed explanation, refer to the documentation on 'Set up using Python-only build (without compilation)' part in [Build wheel from source](../contributing/ci/nightly_builds.md#precompiled-wheels-usage), these args are similar.
-
-## Building for Arm64/aarch64
-
-A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
-
-!!! note
-    Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
-    flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
-    Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
-
-??? console "Command"
-
-    ```bash
-    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
-    DOCKER_BUILDKIT=1 docker build . \
-    --file docker/Dockerfile \
-    --target vllm-openai \
-    --platform "linux/arm64" \
-    -t vllm/vllm-gh200-openai:latest \
-    --build-arg max_jobs=66 \
-    --build-arg nvcc_threads=2 \
-    --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
-    --build-arg RUN_WHEEL_CHECK=false
-    ```
-
-For (G)B300, we recommend using CUDA 13, as shown in the following command.
-
-??? console "Command"
-
-    ```bash
-    DOCKER_BUILDKIT=1 docker build \
-    --build-arg CUDA_VERSION=13.0.1 \
-    --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
-    --build-arg max_jobs=256 \
-    --build-arg nvcc_threads=2 \
-    --build-arg RUN_WHEEL_CHECK=false \
-    --build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
-    --platform "linux/arm64" \
-    --tag vllm/vllm-gb300-openai:latest \
-    --target vllm-openai \
-    -f docker/Dockerfile \
-    .
-    ```
-
-!!! note
-    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
-
-    Run the following command on your host machine to register QEMU user static handlers:
-
-    ```bash
-    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
-    ```
-
-    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
-
-## Use the custom-built vLLM Docker image
-
-To run vLLM with the custom-built Docker image:
-
-```bash
-docker run --runtime nvidia --gpus all \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    -p 8000:8000 \
-    --env "HF_TOKEN=<secret>" \
-    vllm/vllm-openai <args...>
-```
-
-The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
-
-!!! note
-    **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
+--8<-- "docs/getting_started/installation/gpu.md:build-image-from-source"
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -59,11 +59,15 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa
 Here, the `token` field stores your **Hugging Face access token**. For details on how to generate a token,
 see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens).

-Next, start the vLLM server as a Kubernetes Deployment and Service:
+Next, start the vLLM server as a Kubernetes Deployment and Service.
+
+Note that you will want to configure your vLLM image based on your processor arch:

 ??? console "Config"

    ```bash
+    VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest       # use this for x86_64
+    # VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest # use this instead for arm64
    cat <<EOF |kubectl apply -f -
    apiVersion: apps/v1
    kind: Deployment
@@ -81,7 +85,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
        spec:
          containers:
          - name: vllm
-            image: vllm/vllm-openai:latest
+            image: $VLLM_IMAGE
            command: ["/bin/sh", "-c"]
            args: [
              "vllm serve meta-llama/Llama-3.2-1B-Instruct"
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -78,6 +78,73 @@ That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/ent

 More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

+## V1 Process Architecture
+
+vLLM V1 uses a multi-process architecture to separate concerns and maximize throughput. Understanding this architecture is important for properly sizing CPU resources in your deployment. The key processes are:
+
+### API Server Process
+
+The API server process handles HTTP requests (e.g., the OpenAI-compatible API), performs input processing (tokenization, multi-modal data loading), and streams results back to clients. It communicates with the engine core process(es) via ZMQ sockets.
+
+By default, there is **1 API server process**, but when data parallelism is used, the API server count automatically scales to match the data parallel size. This can also be manually configured with the `--api-server-count` flag. Each API server connects to **all** engine cores via ZMQ in a many-to-many topology, enabling any API server to route requests to any engine core. Each API server process uses multiple CPU threads for media loading (controlled by `VLLM_MEDIA_LOADING_THREAD_COUNT`, default 8).
+
+The code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py) and [vllm/v1/utils.py](../../vllm/v1/utils.py).
+
+### Engine Core Process
+
+The engine core process runs the scheduler, manages KV cache, and coordinates model execution across GPU workers. It runs a busy loop that continuously schedules requests and dispatches work to the GPU workers.
+
+There is **1 engine core process per data parallel rank**. For example, with `--data-parallel-size 4`, there are 4 engine core processes.
+
+The code can be found in [vllm/v1/engine/core.py](../../vllm/v1/engine/core.py) and [vllm/v1/engine/utils.py](../../vllm/v1/engine/utils.py).
+
+### GPU Worker Processes
+
+Each GPU is managed by a dedicated worker process. The worker process loads model weights, executes forward passes, and manages GPU memory. Workers communicate with the engine core process that owns them.
+
+There is **1 worker process per GPU**. The total number of GPU worker processes equals `tensor_parallel_size x pipeline_parallel_size` per engine core.
+
+The code can be found in [vllm/v1/executor/multiproc_executor.py](../../vllm/v1/executor/multiproc_executor.py) and [vllm/v1/worker/gpu_worker.py](../../vllm/v1/worker/gpu_worker.py).
+
+### DP Coordinator Process (conditional)
+
+When using data parallelism (`--data-parallel-size > 1`), an additional coordinator process manages load balancing across DP ranks and coordinates synchronized forward passes for MoE models.
+
+There is **1 DP coordinator process** (only when data parallelism is enabled).
+
+The code can be found in [vllm/v1/engine/coordinator.py](../../vllm/v1/engine/coordinator.py).
+
+### Process Count Summary
+
+For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
+
+| Process Type | Count | Notes |
+|---|---|---|
+| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
+| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
+| GPU Worker | `N` (= `DP x TP`, times the pipeline parallel size if used) | One per GPU, executes model forward passes |
+| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
+| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
+
+For example, a typical single-node deployment with 4 GPUs (`vllm serve -tp=4`) has:
+
+- 1 API server + 1 engine core + 4 GPU workers = **6 processes**
+
+<figure markdown="1">
+![V1 Process Architecture - TP=4](../assets/design/arch_overview/v1_process_architecture_tp4.png)
+</figure>
+
+A data parallel deployment with 8 GPUs (`vllm serve -tp=2 -dp=4`) has:
+
+- 4 API servers + 4 engine cores + 8 GPU workers + 1 DP coordinator = **17 processes**
+
+<figure markdown="1">
+![V1 Process Architecture - TP=2, DP=4](../assets/design/arch_overview/v1_process_architecture_tp2_dp4.png)
+</figure>
+
+For CPU resource sizing recommendations, see
+[CPU Resources for GPU Deployments](../configuration/optimization.md#cpu-resources-for-gpu-deployments).
+
 ## LLM Engine

 The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
--- a/docs/design/attention_backends.md
+++ b/docs/design/attention_backends.md
@@ -168,7 +168,7 @@ Priority is **1 = highest** (tried first).
 | `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | All | 9.x |
 | `FLASH_ATTN_DIFFKV` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | Any |
 | `FLEX_ATTENTION` |  | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | Decoder, Encoder Only | Any |
-| `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | %16 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
+| `ROCM_AITER_FA` |  | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
 | `ROCM_AITER_UNIFIED_ATTN` |  | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | N/A |
 | `ROCM_ATTN` |  | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | N/A |
 | `TREE_ATTN` |  | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | Any |
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -282,6 +282,15 @@ If vLLM's compile cache is wrong, this usually means that a factor is missing.
 Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310)
 of how vLLM computes part of the cache key.

+vLLM's compilation cache requires that the code being compiled ends up being serializable.
+If this is not the case, then it will error out on save. Usually the fixes are to either:
+
+- rewrite the non-serializable pieces (this can be hard, because it is currently
+  difficult to tell what is serializable and what isn't)
+- file a bug report
+- ignore the error by setting `VLLM_DISABLE_COMPILE_CACHE=1` (note that this will
+  make warm server starts a lot slower).
+
 ## Debugging CUDAGraphs

 CUDAGraphs is a feature that allows one to:
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -24,7 +24,7 @@ vLLM's plugin system uses the standard Python `entry_points` mechanism. This mec
            ["register_dummy_model = vllm_add_dummy_model:register"]
        })

-    # inside `vllm_add_dummy_model.py` file
+    # inside `vllm_add_dummy_model/__init__.py` file
    def register():
        from vllm import ModelRegistry

@@ -45,7 +45,7 @@ Every plugin has three parts:

 ## Types of supported plugins

- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
+- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function. For an example of an official model plugin, see the [bart-plugin](https://github.com/vllm-project/bart-plugin) which adds support for `BartForConditionalGeneration`.

 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.

@@ -154,4 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
 !!! warning "Deprecations"
    - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
-    - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
+    - `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -2,6 +2,6 @@

 vLLM's examples are split into three categories:

- If you are using vLLM from within Python code, see the *Offline Inference* section.
- If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section.
+- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
+- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
--- a/docs/features/batch_invariance.md
+++ b/docs/features/batch_invariance.md
@@ -108,6 +108,7 @@ Batch invariance has been tested and verified on the following models:
 - **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
 - **Qwen2.5**: `Qwen/Qwen2.5-0.5B-Instruct`, `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`
 - **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
+- **GPT-OSS**: `openai/gpt-oss-20b`, `openai/gpt-oss-120b`

 Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose).

--- a/docs/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -19,12 +19,13 @@ Two main reasons:

 Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.

-Now supports 5 types of connectors:
+Now supports 6 types of connectors:

 - **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
 - **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
 - **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
 - **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
+- **MooncakeConnector**: refer to [examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh) for the example usage of MooncakeConnector disaggregated prefilling. For detailed usage guide, see [MooncakeConnector Usage Guide](mooncake_connector_usage.md).
 - **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:

  ```bash
--- a/docs/features/mooncake_connector_usage.md
+++ b/docs/features/mooncake_connector_usage.md
@@ -31,11 +31,9 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_conne
 ### Proxy

 ```bash
-python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
+python examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py --prefill http://192.168.0.2:8010 --decode http://192.168.0.3:8020
 ```

-> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
-
 Now you can send requests to the proxy server through port 8000.

 ## Environment Variables
@@ -43,16 +41,29 @@ Now you can send requests to the proxy server through port 8000.
 - `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
    - Default: 8998
    - Required only for prefiller instances
-    - Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
-    - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
-    - Used for the decoder notifying the prefiller
+    - For headless instances, must be the same as the master instance
+    - Each instance needs a unique port on its host; using the same port number across different hosts is fine

 - `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
    - Default: 480
    - If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.

-## KV Role Options
+## KV Transfer Config
+
+### KV Role Options

 - **kv_producer**: For prefiller instances that generate KV caches
 - **kv_consumer**: For decoder instances that consume KV caches from prefiller
 - **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
+
+### kv_connector_extra_config
+
+- **num_workers**: Size of thread pool for one prefiller worker to transfer KV caches by mooncake. (default 10)
+- **mooncake_protocol**: Mooncake connector protocol. (default "rdma")
+
+## Example Scripts/Code
+
+Refer to these example scripts in the vLLM repository:
+
+- [run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh)
+- [mooncake_connector_proxy.py](../../examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py)
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
    If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.

    For certain models, we provide alternative chat templates inside [examples](../../examples).
-    For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
+    For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.

 ### Image Inputs

--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -36,6 +36,35 @@ export UCX_NET_DEVICES=all  # or specify network devices like "mlx5_0:1,mlx5_1:1
 !!! tip
    When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables.

+#### Selecting a NIXL transport backend (plugin)
+
+NixlConnector can use different NIXL transport backends (plugins). By default, NixlConnector uses UCX as the transport backend.
+
+To select a different backend, set `kv_connector_extra_config.backends` in `--kv-transfer-config`.
+
+##### Example: using LIBFABRIC backend
+
+```bash
+vllm serve <MODEL> \
+  --kv-transfer-config '{
+    "kv_connector":"NixlConnector",
+    "kv_role":"kv_both",
+    "kv_connector_extra_config":{"backends":["LIBFABRIC"]}
+  }'
+```
+
+You can also pass JSON keys individually using dotted arguments, and you can append list elements using `+`:
+
+```bash
+vllm serve <MODEL> \
+  --kv-transfer-config.kv_connector NixlConnector \
+  --kv-transfer-config.kv_role kv_both \
+  --kv-transfer-config.kv_connector_extra_config.backends+ LIBFABRIC
+```
+
+!!! note
+    Backend availability depends on how NIXL was built and what plugins are present in your environment. Refer to the [NIXL repository](https://github.com/ai-dynamo/nixl) for available backends and build instructions.
+
 ## Basic Usage (on the same host)

 ### Producer (Prefiller) Configuration
@@ -184,6 +213,15 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
 --kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
 ```

+### Cross layers blocks
+
+By default, this feature is disabled. When enabled on attention backends that support it, each logical block is contiguous in physical memory, which reduces the number of buffers that need to be transferred.
+To enable this feature:
+
+```bash
+--kv-transfer-config '{..., "kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'
+```
+
 ## Example Scripts/Code

 Refer to these example scripts in the vLLM repository:
--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -6,34 +6,38 @@
 !!! warning
    Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model.

-To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
+To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):

 ```bash
-wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
 # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
 ```

 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:

 ```bash
-# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+   --tokenizer Qwen/Qwen3-0.6B \
   --tensor-parallel-size 2
 ```

+Alternatively, you can download and use a local GGUF file:
+
+```bash
+wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
+vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
+```
+
 !!! warning
    We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.

-GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
+GGUF assumes that HuggingFace can convert the metadata to a config file. In case HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:

 ```bash
-# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
-   --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
+# If your model is not supported by HuggingFace you can manually provide a HuggingFace compatible config path
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+   --tokenizer Qwen/Qwen3-0.6B \
+   --hf-config-path Qwen/Qwen3-0.6B
 ```

 You can also use the GGUF model directly through the LLM entrypoint:
@@ -66,10 +70,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
      # Create a sampling params object.
      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-      # Create an LLM.
+      # Create an LLM using repo_id:quant_type format.
      llm = LLM(
-         model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+         model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
+         tokenizer="Qwen/Qwen3-0.6B",
      )
      # Generate texts from the prompts. The output is a list of RequestOutput objects
      # that contain the prompt, generated text, and other information.
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -17,6 +17,7 @@ following `quantization.quant_algo` values:
 - `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
 - `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
 - `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
+- `MXFP8`: ModelOpt MXFP8 checkpoints (use `quantization="modelopt_mxfp8"`).

 ## Quantizing HuggingFace Models with PTQ

--- a/docs/getting_started/installation/cpu.arm.inc.md
+++ b/docs/getting_started/installation/cpu.arm.inc.md
@@ -136,15 +136,31 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
+To pull the latest image:

-Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
+```bash
+docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
+```
+
+To pull an image with a specific vLLM version:

 ```bash
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
 docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${VLLM_VERSION}
 ```

+All available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
+
+You can run these images via:
+
+```bash
+docker run \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HF_TOKEN=<secret>" \
+    public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:<tag> <args...>
+```
+
 You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days.

 The latest code can contain bugs and may not be stable. Please use it with caution.
--- a/docs/getting_started/installation/cpu.x86.inc.md
+++ b/docs/getting_started/installation/cpu.x86.inc.md
@@ -161,7 +161,23 @@ uv pip install dist/*.whl
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-[https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
+You can pull the latest available CPU image via:
+
+```bash
+docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
+```
+
+If you want a more specific build, you can find all published CPU-based images here: [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
+
+You can run these images via:
+
+```bash
+docker run \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HF_TOKEN=<secret>" \
+    public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:<tag> <args...>
+```

 !!! warning
    If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -239,27 +239,168 @@ uv pip install -e .
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
-
-Another way to access the latest code is to use the docker images:
+vLLM offers an official Docker image for deployment.
+The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).

 ```bash
-export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model Qwen/Qwen3-0.6B
 ```

-These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
+This image can also be used with other container engines such as [Podman](https://podman.io/).

-The latest code can contain bugs and may not be stable. Please use it with caution.
+```bash
+podman run --device nvidia.com/gpu=all \
+-v ~/.cache/huggingface:/root/.cache/huggingface \
+--env "HF_TOKEN=$HF_TOKEN" \
+-p 8000:8000 \
+--ipc=host \
+docker.io/vllm/vllm-openai:latest \
+--model Qwen/Qwen3-0.6B
+```
+
+You can add any other [engine-args](https://docs.vllm.ai/en/latest/configuration/engine_args/) you need after the image tag (`vllm/vllm-openai:latest`).
+
+!!! note
+    You can either use the `--ipc=host` flag or the `--shm-size` flag to allow the
+    container to access the host's shared memory. vLLM uses PyTorch, which uses shared
+    memory to share data between processes under the hood, particularly for tensor parallel inference.
+
+!!! note
+    Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
+
+    If you need to use those dependencies (having accepted the license terms),
+    create a custom Dockerfile on top of the base image with an extra layer that installs them:
+
+    ```Dockerfile
+    FROM vllm/vllm-openai:v0.11.0
+
+    # e.g. install the `audio` optional dependencies
+    # NOTE: Make sure the version of vLLM matches the base image!
+    RUN uv pip install --system vllm[audio]==0.11.0
+    ```
+
+!!! tip
+    Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
+
+    To use the development version of `transformers`, create a custom Dockerfile on top of the base image
+    with an extra layer that installs their code from source:
+
+    ```Dockerfile
+    FROM vllm/vllm-openai:latest
+
+    RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+    ```

 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]

-See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
+You can build and run vLLM from source via the provided [docker/Dockerfile](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile). To build vLLM:
+
+```bash
+# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build . \
+    --target vllm-openai \
+    --tag vllm/vllm-openai \
+    --file docker/Dockerfile
+```
+
+!!! note
+    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+    current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
+    for vLLM to find the current GPU type and build for that.
+
+    If you are using Podman instead of Docker, you might need to disable SELinux labeling by
+    adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
+
+!!! note
+    If you have not changed any C++ or CUDA kernel code, you can use precompiled wheels to significantly reduce Docker build time.
+
+    *   **Enable the feature** by adding the build argument: `--build-arg VLLM_USE_PRECOMPILED="1"`.
+    *   **How it works**: By default, vLLM automatically finds the correct wheels from our [Nightly Builds](https://docs.vllm.ai/en/latest/contributing/ci/nightly_builds/) by using the merge-base commit with the upstream `main` branch.
+    *   **Override commit**: To use wheels from a specific commit, provide the `--build-arg VLLM_PRECOMPILED_WHEEL_COMMIT=<commit_hash>` argument.
+
+    For a detailed explanation, refer to the 'Set up using Python-only build (without compilation)' part of [Build wheel from source](https://docs.vllm.ai/en/latest/contributing/ci/nightly_builds/#precompiled-wheels-usage); the build arguments described there are similar.
+
+#### Building vLLM's Docker Image from Source for Arm64/aarch64
+
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
+
+!!! note
+    Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` and `--build-arg nvcc_threads=`
+    flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit.
+    Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+
+??? console "Command"
+
+    ```bash
+    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
+    DOCKER_BUILDKIT=1 docker build . \
+    --file docker/Dockerfile \
+    --target vllm-openai \
+    --platform "linux/arm64" \
+    -t vllm/vllm-gh200-openai:latest \
+    --build-arg max_jobs=66 \
+    --build-arg nvcc_threads=2 \
+    --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
+    --build-arg RUN_WHEEL_CHECK=false
+    ```
+
+For (G)B300, we recommend using CUDA 13, as shown in the following command.
+
+??? console "Command"
+
+    ```bash
+    DOCKER_BUILDKIT=1 docker build \
+    --build-arg CUDA_VERSION=13.0.1 \
+    --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
+    --build-arg max_jobs=256 \
+    --build-arg nvcc_threads=2 \
+    --build-arg RUN_WHEEL_CHECK=false \
+    --build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
+    --platform "linux/arm64" \
+    --tag vllm/vllm-gb300-openai:latest \
+    --target vllm-openai \
+    -f docker/Dockerfile \
+    .
+    ```
+
+!!! note
+    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
+
+    Run the following command on your host machine to register QEMU user static handlers:
+
+    ```bash
+    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+    ```
+
+    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
+
+#### Use the custom-built vLLM Docker image
+
+To run vLLM with the custom-built Docker image:
+
+```bash
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HF_TOKEN=<secret>" \
+    vllm/vllm-openai <args...>
+```
+
+The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
+
+!!! note
+    **For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`, is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]

 See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.

-# --8<-- [end:supported-features]
+# --8<-- [end:supported-features]
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -1,3 +1,7 @@
+---
+toc_depth: 3
+---
+
 # GPU

 vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
@@ -84,6 +88,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G

 ### Pre-built images

+<!-- markdownlint-disable MD025 -->
+# --8<-- [start:pre-built-images]
+
 === "NVIDIA CUDA"

    --8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"
@@ -96,7 +103,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G

    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"

+# --8<-- [end:pre-built-images]
+<!-- markdownlint-enable MD025 -->
+
+<!-- markdownlint-disable MD001 -->
 ### Build image from source
+<!-- markdownlint-enable MD001 -->
+
+<!-- markdownlint-disable MD025 -->
+# --8<-- [start:build-image-from-source]

 === "NVIDIA CUDA"

@@ -110,6 +125,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G

    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"

+# --8<-- [end:build-image-from-source]
+<!-- markdownlint-enable MD025 -->
+
 ## Supported features

 === "NVIDIA CUDA"
--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -31,7 +31,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/
 To install a specific version and ROCm variant of vLLM wheel.

 ```bash
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
+uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
 ```

 !!! warning "Caveats for using `pip`" 
@@ -41,7 +41,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
    If you insist on using `pip`, you have to specify the exact vLLM version and full URL of the wheel path `https://wheels.vllm.ai/rocm/<version>/<rocm-variant>` (which can be obtained from the web page).

    ```bash
-    pip install vllm==0.14.1+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
+    pip install vllm==0.15.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
    ```

 # --8<-- [end:pre-built-wheels]
@@ -174,67 +174,44 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]

-#### Use vLLM's Official Docker Image
-
 vLLM offers an official Docker image for deployment.
 The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai-rocm](https://hub.docker.com/r/vllm/vllm-openai-rocm/tags).

-???+ console "Commands"
-    ```bash
-    docker run --rm \
-        --group-add=video \
-        --cap-add=SYS_PTRACE \
-        --security-opt seccomp=unconfined \
-        --device /dev/kfd \
-        --device /dev/dri \
-        -v ~/.cache/huggingface:/root/.cache/huggingface \
-        --env "HF_TOKEN=$HF_TOKEN" \
-        -p 8000:8000 \
-        --ipc=host \
-        vllm/vllm-openai-rocm:latest \
-        --model Qwen/Qwen3-0.6B
-    ```
-
-To use the docker image as base for development, you can launch it in interactive session through overriding the entrypoint.
-
-???+ console "Commands"
-    ```bash
-    docker run --rm -it \
-        --group-add=video \
-        --cap-add=SYS_PTRACE \
-        --security-opt seccomp=unconfined \
-        --device /dev/kfd \
-        --device /dev/dri \
-        -v ~/.cache/huggingface:/root/.cache/huggingface \
-        --env "HF_TOKEN=$HF_TOKEN" \
-        -p 8000:8000 \
-        --ipc=host \
-        --entrypoint bash \
-        vllm/vllm-openai-rocm:latest
-    ```
-
-
-#### Use AMD's Docker Images
-
-The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
-docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
-AMD also offers nightly prebuilt docker image from [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all its dependencies installed. The entrypoint of this docker image is `/bin/bash` (different from the vLLM's Official Docker Image).
-
-???+ console "Commands"
-    ```bash
-    docker pull rocm/vllm-dev:nightly # to get the latest image
-    docker run -it --rm \
-    --network=host \
+```bash
+docker run --rm \
    --group-add=video \
-    --ipc=host \
    --cap-add=SYS_PTRACE \
    --security-opt seccomp=unconfined \
    --device /dev/kfd \
    --device /dev/dri \
-    -v <path/to/your/models>:/app/models \
-    -e HF_HOME="/app/models" \
-    rocm/vllm-dev:nightly
-    ```
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai-rocm:latest \
+    --model Qwen/Qwen3-0.6B
+```
+
+#### Use AMD's Docker Images
+
+Prior to January 20th, 2026, when the official docker images became available on [upstream vLLM docker hub](https://hub.docker.com/v2/repositories/vllm/vllm-openai-rocm/tags/), the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offered a prebuilt, optimized
+docker image designed for validating inference performance on the AMD Instinct MI300X™ accelerator.
+AMD also offers nightly prebuilt docker image from [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all its dependencies installed. The entrypoint of this docker image is `/bin/bash` (different from the vLLM's Official Docker Image).
+
+```bash
+docker pull rocm/vllm-dev:nightly # to get the latest image
+docker run -it --rm \
+--network=host \
+--group-add=video \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--security-opt seccomp=unconfined \
+--device /dev/kfd \
+--device /dev/dri \
+-v <path/to/your/models>:/app/models \
+-e HF_HOME="/app/models" \
+rocm/vllm-dev:nightly
+```

 !!! tip
    Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
@@ -243,7 +220,7 @@ AMD also offers nightly prebuilt docker image from [Docker Hub](https://hub.dock
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]

-Building the Docker image from source is the recommended way to use vLLM with ROCm.
+You can build and run vLLM from source via the provided [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm).

 ??? info "(Optional) Build an image with ROCm software stack"

@@ -269,8 +246,6 @@ Building the Docker image from source is the recommended way to use vLLM with RO
        -t rocm/vllm-dev:base .
    ```

-#### Build an image with vLLM
-
 First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

@@ -292,30 +267,46 @@ Their values can be passed in when running `docker build` with `--build-arg` opt

 To build vllm on ROCm 7.0 for MI200 and MI300 series, you can use the default (which build a docker image with `vllm serve` as entrypoint):

-???+ console "Commands"
-    ```bash
-    DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
-    ```
-
-To run the above docker image `vllm-rocm`, use the below command:
+```bash
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm/vllm-openai-rocm .
+```


-???+ console "Commands"
-    ```bash
-    docker run -it \
-    --network=host \
+To run vLLM with the custom-built Docker image:
+
+```bash
+docker run --rm \
    --group-add=video \
-    --ipc=host \
    --cap-add=SYS_PTRACE \
    --security-opt seccomp=unconfined \
    --device /dev/kfd \
    --device /dev/dri \
-    -v <path/to/model>:/app/model \
-    vllm-rocm \
-    --model Qwen/Qwen3-0.6B
-    ```
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai-rocm <args...>
+```

-Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
+The argument `vllm/vllm-openai-rocm` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
+
+To use the docker image as a base for development, you can launch it in an interactive session by overriding the entrypoint.
+
+???+ console "Commands"
+    ```bash
+    docker run --rm -it \
+        --group-add=video \
+        --cap-add=SYS_PTRACE \
+        --security-opt seccomp=unconfined \
+        --device /dev/kfd \
+        --device /dev/dri \
+        -v ~/.cache/huggingface:/root/.cache/huggingface \
+        --env "HF_TOKEN=$HF_TOKEN" \
+        --network=host \
+        --ipc=host \
+        --entrypoint bash \
+        vllm/vllm-openai-rocm
+    ```

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -57,7 +57,7 @@ This guide will help you quickly get started with vLLM to perform:
        It currently supports Python 3.12, ROCm 7.0 and `glibc >= 2.35`.

    !!! note    
-        Note that, previously, docker images were published using AMD's docker release pipeline and were located `rocm/vlm-dev`. This is being deprecated by using vLLM's docker release pipeline.
+        Note that, previously, docker images were published using AMD's docker release pipeline and were located at `rocm/vllm-dev`. This is being deprecated in favor of vLLM's docker release pipeline.

 === "Google TPU"

--- a/docs/mkdocs/javascript/reo.js
+++ b/docs/mkdocs/javascript/reo.js
@@ -0,0 +1,3 @@
+// Reo.Dev documentation tracking
+// https://docs.reo.dev/integrations/tracking-beacon/install-javascript-for-documentation
+!function(){var e,t,n;e="d5c4337961ef0ac",t=function(){Reo.init({clientID:"d5c4337961ef0ac"})},(n=document.createElement("script")).src="https://static.reo.dev/"+e+"/reo.js",n.defer=!0,n.onload=t,document.head.appendChild(n)}();
--- a/Show More
+++ b/Show More