Compare commits

138 Commits

Author SHA1 Message Date
Andreas Karatzas
89a77b1084 [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
(cherry picked from commit 4c078fa546)
(cherry picked from commit a976961fb77d38129abf69edd4952101731f2421)
2026-02-24 20:30:22 -08:00
Kevin H. Luu
d3c1513f5f [ci] Use the right tag for CPU arm64 image (#34915)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit a1a2d79442)
(cherry picked from commit 772f70839192262ff01c533d821a11a225d1c00f)
2026-02-24 20:30:13 -08:00
Cyrus Leung
5dbfbc967b [CI/Build] Fix gRPC version mismatch (#35013)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
(cherry picked from commit 965fe45935)
(cherry picked from commit 90308959295b66049024649fe1273070477f343d)
2026-02-24 20:30:02 -08:00
khluu
c86cdcbcd2 Revert "[Release 2.10] Update to Torch 2.10 - final release (#30525)"
This reverts commit f97ca67176.
2026-02-24 20:28:53 -08:00
khluu
3c9496f146 Revert "[Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)"
This reverts commit 55a1baebc5.
2026-02-24 20:28:45 -08:00
khluu
2d5be1dd5c release script
Signed-off-by: khluu <khluu000@gmail.com>
2026-02-12 02:37:52 -08:00
Michael Goin
7a06e5b05b [Bugfix] Fix MTP accuracy for GLM-5 (#34385)
Signed-off-by: mgoin <mgoin64@gmail.com>
(cherry picked from commit ec12d39d44)
2026-02-11 20:54:27 -08:00
Junseo Park
946b2f106c [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963)
Signed-off-by: pjs102793 <pjs102793@naver.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 5458eb835d)
2026-02-11 20:54:14 -08:00
Nick Hill
5e8adb0c49 [Misc] Bump fastsafetensors version for latest fixes (#34273)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
(cherry picked from commit 79504027ef)
2026-02-11 20:54:00 -08:00
Xinyu Dong
9be1ff2d3a [Bugfix] fix default is_neox_style is True for deepseek (#34353)
Signed-off-by: dongxinyu03 <dongxinyu03@baidu.com>
(cherry picked from commit be7f3d5d20)
2026-02-11 20:53:40 -08:00
Jee Jee Li
b3ee90f961 [Model] GLM adaptation (#34124)
(cherry picked from commit 978a37c823)
2026-02-11 20:53:11 -08:00
Seiji Eicher
c44d0c6d66 Patch protobuf for CVE-2026-0994 (#34253)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
(cherry picked from commit 5045d5c983)
2026-02-11 02:33:40 -08:00
Kunshang Ji
83db96d8cd [XPU][9/N] clean up existing ipex code/doc (#34111)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
(cherry picked from commit cb9574eb85)
2026-02-11 02:33:27 -08:00
zofia
dbfb79fe45 [XPU][7/N] enable xpu fp8 moe (#34202)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit b482f71e9f)
2026-02-11 02:33:15 -08:00
Roger Wang
b2e1fc3589 [Bugfix][Core] Fix CPU memory leak from Request reference cycle in prefix caching (#34183)
Signed-off-by: Roger Wang <hey@rogerw.io>
(cherry picked from commit 8a5e0e2b2b)
2026-02-11 02:33:04 -08:00
Gregory Shtrasberg
55a1baebc5 [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
(cherry picked from commit c60f8e3b49)
2026-02-11 02:32:52 -08:00
Charlie Fu
e1e9841631 [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. (#33945)
Signed-off-by: charlifu <charlifu@amd.com>
(cherry picked from commit bb9f97308d)
2026-02-11 02:32:41 -08:00
zofia
5bd63387c3 [XPU][6/N] add xpu scaled_mm kernel (#34117)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
(cherry picked from commit 9bdb06b436)
2026-02-11 02:32:27 -08:00
wang.yuqi
22b64948f6 [Frontend][last/5] Make pooling entrypoints request schema consensus. (#31127)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-09 06:42:38 +00:00
Reagan Lee
7c233dbb36 [Tiny] Rename encoder budget file to more specific name (#34103)
Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”>
Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”>
2026-02-09 03:48:19 +00:00
kourosh hakhamaneshi
a75a5b54c7 [bug-fix] supported_tasks is breaking backward compatibility at init_app_state (#34027)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Signed-off-by: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-09 09:46:46 +08:00
Andrey Talman
f97ca67176 [Release 2.10] Update to Torch 2.10 - final release (#30525) 2026-02-08 13:51:09 -08:00
danisereb
084aa19f02 Add support for ModelOpt MXFP8 dense models (#33786)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-08 11:16:48 -08:00
navmarri14
1ecfabe525 glm 4.6 fused tuned inference config for B200 (#32958) 2026-02-08 18:55:47 +00:00
Richard Zou
4df841fe75 [torch.compile] Add an option to force-enable the MOE cold start optimization (#33735)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-08 18:42:56 +00:00
TomerBN-Nvidia
a263aa6140 [BugFix] Change support no act and mul for marlin (#34088)
Signed-off-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
Co-authored-by: Tomer Natan <tbarnatan@computelab-frontend-8.nvidia.com>
2026-02-08 17:18:22 +00:00
aabbccddwasd
179ae7da8f [Revert] Fix performance regression for GLM-4.7-GPTQ decode and MTP acceptance rate (#33771)
Signed-off-by: aabbccddwasd <aabbccddwasd@qq.com>
2026-02-08 08:13:24 -08:00
Reagan Lee
c4df59ad43 Add embedding input functionality for disabled modalities [remake] (#32493)
Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”>
Signed-off-by: Reagan Lee <reaganjlee@gmail.com>
Signed-off-by: Reagan Lee <96998476+reaganjlee@users.noreply.github.com>
Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-08 04:57:16 -08:00
TJian
785cf28fff [ROCm] [CI] Reduce Resource of two test groups (#34059)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2026-02-08 15:17:26 +08:00
Nick Hill
a96197f564 [Perf] Simplify DeepseekV32 tokenizer, ensure fast detokenization used (#33855)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-08 07:16:34 +00:00
Andreas Karatzas
ab10d79855 [ROCm][Bugfix] fix act_quant_fusion module import error (#34069)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-07 19:21:12 -08:00
Cyrus Leung
7fcb705b80 [CI/Build] Skip GCS test (#34057)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 08:52:38 -08:00
Cyrus Leung
b956cdf818 [Doc] Fix run_batch docs (#34056)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 06:18:16 -08:00
Hashem Hashemi
ed17f54c8b Perf tuning and expansion of cases covered for wvSplitKrc (#33493)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-07 05:33:11 -08:00
Jiang Wu
860981d8d8 Make directory exist ok for ray spinning up multiple replicas on a single instance (#33604)
Signed-off-by: Jiang Wu <jwu@cclgroup.com>
2026-02-07 05:30:49 -08:00
zifeitong
52181baaea Update DeepGEMM version pin in Dockerfile to match #32479 (#33935)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-07 05:30:22 -08:00
Rohan Potdar
de3869bb4d move checks out of unified_kv_cache_update custom op (#33943)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-02-07 05:30:09 -08:00
whx
ce9b3cd3e9 [PluggableLayer][3/N] Apply PluggableLayer to mamba layers. (#33660)
Signed-off-by: whx-sjtu <2952154980@qq.com>
2026-02-07 05:26:05 -08:00
Jee Jee Li
db4ede9743 [Model] Enable Step3p5ForCausalLM testing (#33755)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2026-02-07 05:25:24 -08:00
Pooya Davoodi
2cb2340f7a [Frontend]Add support for transcriptions and translations to run_batch (#33934)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-07 05:24:57 -08:00
TundeAtSN
4df44c16ba Enable Eagle3 speculative decoding for Mistral3ForConditionalGeneration to support eagle3 (#33939)
Signed-off-by: Akintunde Oladipo <akintunde.oladipo@servicenow.com>
Signed-off-by: TundeAtSN <akintunde.oladipo@servicenow.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-07 05:24:52 -08:00
Richard Zou
81fe69cae5 [torch.compile] Stop compiling identical artifacts (#34003)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-02-07 05:24:48 -08:00
Mohammad Miadh Angkad
dd6a6e1190 [Kernel] Add KernelConfig flag to enable/disable FlashInfer autotune (#34006)
Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-07 05:24:44 -08:00
Cyrus Leung
edb359cce4 [Renderer] Define render_cmpl and render_chat (#34039)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 05:24:40 -08:00
wang.yuqi
6ed5eda300 [CI][Build] Pin grpcio-tools==1.78.0 (#34048)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-02-07 05:24:35 -08:00
Cyrus Leung
11a4c9d30d [Misc] Simplify get_max_tokens (#34036)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 00:59:49 -08:00
lukec
15a0b9e570 Fix spelling errors (#33978) 2026-02-06 23:58:50 -08:00
Andreas Karatzas
c490d8cc73 [ROCm][CI] Pinning lm-eval version to resolve multi-modal small eval bug (#34038)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-06 22:21:08 -08:00
Cyrus Leung
48312e579a [Misc] Make PlaceholderRange.get_num_embeds a method (#34035)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-07 05:30:17 +00:00
Vel
bc32444b23 [Kernel] Add enable_sm120_or_later for SM121 (DGX Spark) CUTLASS support (#33517)
Signed-off-by: code4me2 <velvetmoon222999@gmail.com>
2026-02-06 20:28:01 -08:00
Wentao Ye
18e8545297 [Revert] Add util handle_deprecated back (#33998)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-07 04:14:45 +00:00
果冻虾仁
6f7adc533a fix description in plugin_system.md (#33999) 2026-02-06 19:37:02 -08:00
Nick Hill
40218a82ba [ModelRunner V2] Revert token rank comparison difference for now (#34017)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-07 11:11:05 +08:00
kourosh hakhamaneshi
1c3b22058f [Misc] Add backward-compatible import aliases for renamed translations module (#34015)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-07 11:01:41 +08:00
Xin Yang
3920cafdd6 [Bugfix] Fix _fused_moe_lora_expand signature mismatch (#33821)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-02-07 10:45:59 +08:00
rasmith
ec28784fdc [CI][AMD]Bugfix] Check that model_config is not None in enable_norm_pad_fusion (#34007)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-07 02:43:25 +00:00
Nicolò Lucchesi
55aeec04f5 [Bugfix] Fix Whisper tokenization (#34011)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-07 10:42:52 +08:00
Ikenna
906077181b [Bugfix] Fix QK Norm+RoPE fusion pattern matching on B200+FP8 (#33967)
Signed-off-by: Ikenna <ikennachifo@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-07 02:27:33 +00:00
Aaron Hao
89a385d79f [Feat][RL] Pause and Resume with keep requests for single engine (#32351)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-07 00:08:58 +00:00
kourosh hakhamaneshi
4a2d00eafd [bugfix] [ROCm] Fix premature CUDA initialization in platform detection (#33941)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
2026-02-06 16:17:55 -06:00
Dimitrios Bariamis
207c3a0c20 Fix RoutingMethodType logic (#33919)
Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-02-06 14:03:34 -08:00
Sumanth R Hegde
ae2e93f89b [Fix] Fix logprobs=0 handling for /inference/v1/generate endpoint (#34010)
Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
2026-02-06 20:33:40 +00:00
xuebwang-amd
9e9acce577 [Bugfix] Fix no attribute error of SharedFusedMoE (DeepSeek-V3.1 as test model) (#33993)
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
2026-02-06 19:11:32 +00:00
Charlie Fu
fe5438200b [Rocm][Bugfix] Fix dtype not same for gemm_a4w4 op (#33734)
Signed-off-by: charlifu <charlifu@amd.com>
2026-02-06 19:09:59 +00:00
Wentao Ye
77c09e1130 [Refactor] Remove align block size logic in moe_permute (#33449)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-06 10:57:06 -08:00
zhrrr
16786da735 [Model Runner V2] support apply penalty for spec decode (#33251)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
2026-02-06 10:56:48 -08:00
vllmellm
aaa2efbe98 [DOC] [ROCm] Update docker deployment doc (#33971)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 10:05:35 -08:00
Seiji Eicher
aca5967416 [KV Connector] Add missing method overrides to MultiConnector (#33292)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2026-02-06 12:58:21 -05:00
Wentao Ye
67a746e87f [Log] Optimize duplicate startup log (#33944)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-02-06 17:49:56 +00:00
Chauncey
7bec435130 [Bugfix] Fix the issue where tool calling does not work when using fast detokenization with dsv32 (#33964)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-06 09:23:44 -08:00
Eldar Kurtić
5c52644b10 [Docs] Update link to Benchmark CLI documentation (#33254)
Signed-off-by: Eldar Kurtić <8884008+eldarkurtic@users.noreply.github.com>
2026-02-06 16:00:59 +00:00
zofia
2ce9fe4ad0 [XPU][5/N] add wna16 xpu kernel (#33973)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
2026-02-06 15:59:53 +00:00
Cyrus Leung
cd8b405bd0 [Refactor] Consolidate sequence normalization and enc-dec parsing (#33928)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-06 15:43:47 +00:00
tc-mb
4707f7ebb4 [Model] Support MiniCPM-o 4.5 (#33431)
Signed-off-by: caitianchi <caitianchi@modelbest.cn>
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: mslv <mslv@baai.ac.cn>
2026-02-06 15:29:10 +00:00
Michael Goin
c39ee9ee2b [Docs] Add sections on process architecture and minimum CPU resources (#33940)
It seems users can be confused about vLLM's performance when running
with very small amounts of CPU cores available. We are missing a clear
overview of what vLLM's process architecture is, so I added this along with
some diagrams in arch_overview.md, and included a section on CPU resource
recommendations in optimization.md

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-02-06 15:26:43 +00:00
Andreas Karatzas
350ca72c04 [ROCm][AITER] Fix AITER import regression for explicit backend selection (#33749)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-06 15:08:16 +00:00
FredericOdermatt
1fb0495a72 [FIX] guidance: use max(vocab_size, len(tokenizer)) for n_vocab (#33509)
Signed-off-by: Frederic Odermatt <frederic.odermatt@44ai.ch>
2026-02-06 14:23:03 +00:00
Raushan Turganbay
85ee1d962b [Bugfix] Fix models and tests for transformers v5 (#33977)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 21:47:41 +08:00
Harry Mellor
51a7bda625 Update WeightTransferConfig to be more standard like the others (#33989)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 13:15:00 +00:00
SorenDreano
6e7b1c4b59 [Docs] Improve documentation (#33799)
Co-authored-by: Soren Dreano <soren@numind.ai>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2026-02-06 12:57:09 +00:00
Kurt Shuster
2991dd3d22 [Bugfix][Model] Support LoRA on Qwen3 Output Embedding (#29816)
Signed-off-by: kurt <kurt@thinkingmachines.ai>
2026-02-06 20:25:31 +08:00
Luka Govedič
ac32e66cf9 [torch.compile] Reorganize vllm/compilation and tests/compile (0/N for vLLM IR) (#33731)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-06 04:19:49 -08:00
Fadi Arafeh
f79d9dce16 [CPU][BugFix] Fix loading of w8a8int models with bias (#33582)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-02-06 11:59:20 +00:00
Harry Mellor
ba5cbbf107 Bump HF Hub client to get bug fix (#33984)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 11:25:33 +00:00
zhang-prog
233b26ab35 [PaddleOCR-VL] Add BC for transformers 5.0 config (#33976)
Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
2026-02-06 10:33:49 +00:00
Harry Mellor
791a94bed0 Consolidate and fix forbidden import pre-commit checks (#33982)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 01:47:41 -08:00
Xinyu Chen
e969a169ef support view_from_cpu_tensor on XPU (#33868)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
2026-02-06 08:34:20 +00:00
Harry Mellor
6d8d34be6d Fix main pre-commit (#33975)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-06 00:08:05 -08:00
Gassan Salama
1363e3d6d5 [cpu][performance] CPU Paged Attention NEON BFMMLA BF16 Implementation (#32263)
Signed-off-by: Gassan <gassan.salama@arm.com>
2026-02-06 15:01:48 +08:00
chengchengpei
965525667b Onboard voyage-4-nano (#33720)
Signed-off-by: Chengcheng Pei <chengchengpei@outlook.com>
Signed-off-by: chengchengpei <5881383+chengchengpei@users.noreply.github.com>
Co-authored-by: chengchengpei <5881383+chengchengpei@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-06 06:23:34 +00:00
sihao_li
6550815c3a [XPU]Replace pip in docker.xpu with uv pip (#31112)
Signed-off-by: sihao.li <sihao.li@intel.com>
2026-02-06 14:02:33 +08:00
Kunshang Ji
7439e4f41b [XPU][4/N] add mxfp4 moe model support (#33679)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-02-06 13:03:59 +08:00
R3hankhan
ac04dd374f [CPU] Add BF16 Kernel type for s390x (#33788)
Signed-off-by: Rehan Khan <Rehan.Khan7@ibm.com>
2026-02-06 04:57:02 +00:00
Cyrus Leung
035a6cb09a [Misc] Update code for encoder-decoder models (#33900)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-06 11:38:39 +08:00
Mingliang Li
a32cb49b60 feat(frontend): early-fail tokenization guard for user requests (#31366)
Signed-off-by: limingliang <limingliang@stepfun.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: limingliang <limingliang@stepfun.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 19:38:02 -08:00
Rabi Mishra
20d7454c9b fix(ROCm): Make flash_attn import optional in MLA attention (#33511)
Signed-off-by: rabi <ramishra@redhat.com>
2026-02-06 02:22:53 +00:00
Simon Mo
5819ca8944 [Docs] Add reo analytics (#33957)
Signed-off-by: simon-mo <simon.mo@hey.com>
2026-02-05 17:42:22 -08:00
Xin Yang
79028d4388 [Perf] Disable clean_logits in deepgemm fp8_mqa_logits kernel (#33568) 2026-02-05 20:34:00 -05:00
emricksini-h
325ab6b0a8 [Feature] OTEL tracing during loading (#31162) 2026-02-05 16:59:28 -08:00
Wei Zhao
91a07ff618 [Bugfix] Fix DeepSeek v3.2 tokenizer outputting None issue (#33832)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-02-05 23:50:49 +00:00
Hashem Hashemi
d5c4800112 Adds padding and perf improvements to wvSplitK_fp8 (#33527)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-02-05 22:16:02 +00:00
Lumosis
42d5d705f9 [Minor] Sort safetensors files to ensure deterministic loading order (#33491)
Signed-off-by: Lihao Ran <imlihao.ran@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-02-05 17:05:09 -05:00
Cyrus Leung
116880a5a0 [Bugfix] Make MM batching more robust (#33817)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 20:40:58 +00:00
Matthew Bonanni
4145e50d85 [Bugfix] Fix DSV3.2 NVFP4 (#33932)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-02-05 19:22:19 +00:00
Nicolò Lucchesi
20f5d185a6 [Misc] Rename translations to speech_to_text for OAI serving component (#33904)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 19:16:52 +00:00
Harry Mellor
1887acca9e Fix tokenizer test for renamed attr on Transformers v5 (#33902)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-02-05 19:16:20 +00:00
Tsukasa OI
92e7562a99 [Bugfix] Suppress non-TTY color output on the process name part of the log (#29714)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
2026-02-05 18:47:09 +00:00
Isotr0py
87d0d17ab5 [Models] Consolidate Deepseek-OCR2 processor (#33909)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-05 18:29:20 +00:00
bnellnm
a57c8228ff [Moe Refactor] Make Inplace Flag for FusedMoEModularKernel part of the constructor (#33375)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-05 18:07:18 +00:00
zackyoray
1ee95841bd [Bugfix] Fix swapped engine_ids in NIXL Llama 4 local attention path (#33795)
Signed-off-by: Yoray Zack <yorayz@nvidia.com>
2026-02-05 17:51:58 +00:00
Nicolò Lucchesi
7d8c6804e2 [Misc] Add debug logs (#33931)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 09:42:40 -08:00
Benjamin Chislett
af3162d3aa [Spec Decode] Unified Parallel Drafting (#32887)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-02-05 12:37:18 -05:00
danisereb
5b2a9422f0 [BugFix] Fix LoRA Fp8 (#33879)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-02-05 17:25:55 +00:00
Aaron Hao
c1858b7ec8 [Feat][RL][1/2] Native Weight Syncing API: NCCL (#31943)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
Signed-off-by: Aaron Hao <ahao@anyscale.com>
Co-authored-by: SumanthRH <sumanthrh99@gmail.com>
2026-02-05 12:13:23 -05:00
Mario Hong
82914d2ae8 [Bugfix] Fix step3p5 parser when using mtp (#33690)
Signed-off-by: mariohong <mariohong128@gmail.com>
2026-02-05 16:04:04 +00:00
Nicolò Lucchesi
81a90e5277 [Docs] Add bart-plugin to docs (#33905)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-05 12:20:25 +00:00
wang.yuqi
1c3a221d3b [Bugfix] Fix corner case of sparse embedding (#33886)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 02:51:22 -08:00
Cyrus Leung
7bd42e609d [Refactor] Clean up input preprocessing (#33687)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-05 18:43:42 +08:00
Isotr0py
a2522839d8 [Bugfix] Fix Kimi-K2.5 NVFP4 checkpoints weight loading (#33876)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-02-05 10:29:54 +00:00
jiahanc
59a5cb387a [perf] Integrate flashinfer concat_mla_k (#31171) 2026-02-05 05:23:11 -05:00
liranschour
8322d4e47f Enable Cross layers KV cache layout at NIXL Connector V2 (#33339)
Signed-off-by: Liran Schour <lirans@il.ibm.com>
Signed-off-by: liranschour <liranschour@users.noreply.github.com>
Co-authored-by: Or Ozeri <or@ozery.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-02-05 02:17:02 -08:00
Andreas Karatzas
3e472e81f9 [ROCm][Bugfix][CI] Fix hybrid models and their tests (Mamba/Jamba/Bamba) (#32710)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
2026-02-05 10:01:23 +00:00
Cyrus Leung
038914b7c8 [Refactor] Move task outside of PoolingParams.verify (#33796)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 09:33:11 +00:00
Pavani Majety
d2f4a71cd5 [Bugfix] Kimi-K2 grouped_topk usage for Flashinfer monolithic kernels. (#33858)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2026-02-05 09:32:10 +00:00
Mark McLoughlin
2abd97592f [KV Connector][Metrics] Do not count local prefix cache hits in connector queries (#30522)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-05 09:57:27 +02:00
Chauncey
6abb0454ad [Perf] Optimize the performance of structured output + reasoning (#33557)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-05 15:45:29 +08:00
Li, Jiang
db6f71d4c9 [CI/Build] Fix CPU CI test case title (#33870)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-02-05 15:07:14 +08:00
Fadi Arafeh
fd03538bf9 [CPU][BugFix] Allow w8a8 oneDNN quantized matmul to support 3D inputs (#33727)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2026-02-05 06:26:09 +00:00
Andreas Karatzas
1f70313e59 [Bugfix] Fix ScoreMultiModalParam multi-document scoring returning single result (#33837)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-02-05 06:17:00 +00:00
Li, Jiang
07daee132b [CI/Build] Parallelize CPU CI tests (#33778)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-02-05 13:53:48 +08:00
Andrew Xia
9595afda18 [2/N] move responses/serving _make_response_output_items logic to parser (#33281)
Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2026-02-05 13:46:15 +08:00
rasmith
c1395f72cd [CI][AMD][BugFix] Ensure VLLM_ROCM_USE_AITER is set so test_rocm_aiter_topk.py can run correctly (#33840)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-02-05 05:05:48 +00:00
rinbaro
007b183d74 [docs] fix unintentional misspellings (#33863)
Signed-off-by: rinbaro <ilgomishra@gmail.com>
2026-02-04 20:50:59 -08:00
Nick Hill
add9f1fbd9 [Minor] Include StreamingInput in inputs package (#33856)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-02-05 04:38:20 +00:00
Luka Govedič
e3bf79ffa0 Revert "[Attention][FA3] Update FA3 to include new swizzle optimization" (#33841) 2026-02-04 19:54:27 -08:00
Andreas Karatzas
fb1270f1f8 [CI][Bugfix]: return McpCall for built-in MCP tools in non-streaming mode (#32762)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-02-05 11:14:06 +08:00
Kevin H. Luu
72bb24e2db [release] Minor fixes to release annotation (#33849)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-02-05 02:07:35 +00:00
Chauncey
a7be77beef [Bugfix] fix DeepSeek R1 with CUTLASS MLA Broken on B200 (#33637)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-02-05 01:28:36 +00:00
475 changed files with 19496 additions and 8884 deletions

View File

@@ -1,8 +0,0 @@
group: Hardware
steps:
- label: "Arm CPU Test"
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -0,0 +1,100 @@
group: CPU
depends_on: []
steps:
- label: CPU-Kernel Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- cmake/cpu_extension.cmake
- CMakeLists.txt
- vllm/_custom_ops.py
- tests/kernels/attention/test_cpu_attn.py
- tests/kernels/moe/test_cpu_fused_moe.py
- tests/kernels/test_onednn.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Language Generation and Pooling Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/
- tests/models/language/generation/
- tests/models/language/pooling/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
- label: CPU-Quantization Model Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/
- vllm/model_executor/layers/quantization/cpu_wna16.py
- vllm/model_executor/layers/quantization/gptq_marlin.py
- vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
- vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
- vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
- tests/quantization/test_compressed_tensors.py
- tests/quantization/test_cpu_wna16.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
- label: CPU-Distributed Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- csrc/cpu/shm.cpp
- vllm/v1/worker/cpu_worker.py
- vllm/v1/worker/gpu_worker.py
- vllm/v1/worker/cpu_model_runner.py
- vllm/v1/worker/gpu_model_runner.py
- vllm/platforms/cpu.py
- vllm/distributed/parallel_state.py
- vllm/distributed/device_communicators/cpu_communicator.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
- label: CPU-Multi-Modal Model Tests %N
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
# - vllm/
- vllm/model_executor/layers/rotary_embedding
- tests/models/multimodal/generation/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
parallelism: 2
- label: "Arm CPU Test"
depends_on: []
soft_fail: true
device: arm_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

View File

@@ -1,13 +1,6 @@
group: Hardware
depends_on: ~
steps:
- label: "Intel CPU Test"
soft_fail: true
device: intel_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
- label: "Intel HPU Test"
soft_fail: true
device: intel_hpu

View File

@@ -14,7 +14,7 @@ BUILDKITE_COMMIT=$3
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
# skip build if image already exists
if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
echo "Image not found, proceeding with build..."
else
echo "Image found"
@@ -24,10 +24,10 @@ fi
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
--build-arg buildkite_commit=$BUILDKITE_COMMIT \
--tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
--target vllm-test \
--progress plain .
# push
docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu

View File

@@ -248,8 +248,8 @@ steps:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- group: "Publish wheels"
key: "publish-wheels"
- group: "Publish release artifacts"
key: "publish-release-artifacts"
steps:
- block: "Confirm update release wheels to PyPI (experimental, use with caution)?"
key: block-upload-release-wheels
@@ -265,6 +265,27 @@ steps:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
- block: "Confirm update release images to DockerHub"
key: block-update-release-images-dockerhub
depends_on:
- input-release-version
- annotate-release-workflow
- label: "Publish release images to DockerHub"
depends_on:
- block-update-release-images-dockerhub
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-release-images-dockerhub.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)

View File

@@ -27,7 +27,7 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-
To download and upload the image:
\`\`\`
Download images:
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
@@ -35,8 +35,12 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
Tag and push images:
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
@@ -62,34 +66,21 @@ docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-a
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
# CPU images (vllm/vllm-openai-cpu)
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
@@ -103,6 +94,20 @@ docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-a
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64

View File

@@ -0,0 +1,26 @@
#!/bin/bash
set -euox pipefail
echo "--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &
echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &

View File

@@ -2,119 +2,19 @@
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
set -euox pipefail
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
IMAGE_NAME="cpu-test-$NUMA_NODE"
TIMEOUT_VAL=$1
TEST_COMMAND=$2
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# building the docker image
echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
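
The refactored harness above now takes a timeout value as $1 and a quoted test-command string as $2, which is how the new CPU pipeline steps invoke it. A minimal illustrative invocation (a sketch assuming a local vLLM checkout with Docker available; the 20m timeout and pytest target mirror one of the pipeline steps) would be:
# Run the CI harness locally: first argument is the timeout, second is the command string executed inside the container
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
  pytest -x -v -s tests/kernels/test_onednn.py"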

View File

@@ -39,6 +39,8 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests

View File

@@ -0,0 +1,98 @@
#!/bin/bash
set -ex
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
echo "RELEASE_VERSION is not set"
exit 1
fi
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
# Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
# Tag and push images:
## CUDA
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
## ROCm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
## CPU
docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
# Create multi-arch manifest:
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}

View File

@@ -70,6 +70,7 @@ steps:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -82,6 +83,7 @@ steps:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
@@ -231,6 +233,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
@@ -266,10 +269,16 @@ steps:
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- pushd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- popd
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
@@ -505,7 +514,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
@@ -525,6 +534,7 @@ steps:
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py
- label: Samplers Test # 56min
timeout_in_minutes: 75
@@ -542,7 +552,7 @@ steps:
- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- vllm/lora
@@ -638,7 +648,7 @@ steps:
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- csrc/attention/
@@ -653,7 +663,7 @@ steps:
- label: Kernels Quantization Test %N # 64min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- csrc/quantization/
@@ -666,7 +676,7 @@ steps:
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
@@ -829,7 +839,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -854,10 +864,11 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
mirror_hardwares: [amdexperimental, amdproduction]
@@ -890,7 +901,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -911,7 +922,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
agent_pool: mi325_8
# grade: Blocking
torch_nightly: true
source_file_dependencies:
@@ -1180,16 +1191,16 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
@@ -1546,15 +1557,15 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py

View File

@@ -63,6 +63,7 @@ steps:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -75,6 +76,7 @@ steps:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
@@ -204,6 +206,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
@@ -238,10 +241,16 @@ steps:
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- pushd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
# NEW rlhf examples
- pushd ../examples/offline_inference/new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- popd
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
@@ -444,7 +453,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
@@ -510,6 +519,7 @@ steps:
# However, find does not normally propagate error codes, so we combine it with xargs
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- pytest -s -v compile/passes --ignore compile/passes/distributed
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -795,10 +805,11 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
@@ -1070,14 +1081,14 @@ steps:
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
@@ -1144,6 +1155,8 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py
- pytest -v -s distributed/test_shm_buffer.py
- pytest -v -s distributed/test_shm_storage.py
- pytest -v -s distributed/test_packed_tensor.py
- pytest -v -s distributed/test_weight_transfer.py
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
@@ -1409,8 +1422,8 @@ steps:
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
# Run sequence parallel tests
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
- label: Distributed Tests (H100) # optional
gpu: h100
@@ -1418,7 +1431,7 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
- pytest -v -s tests/distributed/test_context_parallel.py
- VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py

View File

@@ -2,7 +2,7 @@ group: Compile
depends_on:
- image-build
steps:
- label: Sequence Parallel Tests (2 GPUs)
- label: Sequence Parallel Correctness Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
num_devices: 2
@@ -11,12 +11,12 @@ steps:
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/distributed/test_sequence_parallel.py
- tests/compile/correctness_e2e/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: Sequence Parallel Tests (2xH100)
- label: Sequence Parallel Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
@@ -24,24 +24,30 @@ steps:
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- label: AsyncTP Correctness Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 40
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/model_executor/layers
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_sequence_parallelism.py
- tests/compile/distributed/test_async_tp.py
- tests/compile/passes/distributed/
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -s -v tests/compile/passes/distributed
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
@@ -55,17 +61,17 @@ steps:
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

View File

@@ -9,6 +9,7 @@ steps:
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- pytest -v -s cuda/test_platform_no_cuda_init.py
- label: Cudagraph
timeout_in_minutes: 20

View File

@@ -62,6 +62,7 @@ steps:
- tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- examples/offline_inference/new_weight_syncing/
- tests/examples/offline_inference/data_parallel.py
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
@@ -96,9 +97,14 @@ steps:
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
- cd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# NEW rlhf examples
- cd new_weight_syncing
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py
- label: Distributed Tests (8 GPUs)(H100)
timeout_in_minutes: 10

View File

@@ -72,7 +72,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
@@ -122,6 +122,7 @@ steps:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/test_pooling_params.py
- tests/multimodal
- tests/renderers
- tests/standalone_tests/lazy_imports.py
@@ -134,6 +135,7 @@ steps:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s test_pooling_params.py
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_

View File

@@ -33,10 +33,11 @@ steps:
timeout_in_minutes: 45
source_file_dependencies:
- vllm/
- tests/models/test_terratorch.py
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py
- pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
- label: Basic Models Test (Other CPU) # 5min
depends_on:

View File

@@ -3,7 +3,7 @@ depends_on:
- image-build
steps:
- label: PyTorch Compilation Unit Tests
timeout_in_minutes: 30
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/compile
@@ -17,6 +17,14 @@ steps:
# (using -0 for proper path handling)
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/compile/passes
commands:
- pytest -s -v compile/passes --ignore compile/passes/distributed
- label: PyTorch Fullgraph Smoke Test
timeout_in_minutes: 35
source_file_dependencies:

View File

@@ -121,24 +121,9 @@ repos:
name: Update Dockerfile dependency graph
entry: tools/pre_commit/update-dockerfile-graph.sh
language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/pre_commit/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/pre_commit/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
- id: check-forbidden-imports
name: Check for forbidden imports
entry: python tools/pre_commit/check_forbidden_imports.py
language: python
types: [python]
additional_dependencies: [regex]

View File

@@ -11,7 +11,7 @@ This directory used to contain vLLM's benchmark scripts and utilities for perfor
## Usage
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/benchmarking/cli/#benchmark-cli).
For full CLI reference see:

View File

@@ -686,6 +686,7 @@ def get_model_params(config):
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
"GlmMoeDsaForCausalLM",
"Glm4MoeForCausalLM",
"Glm4MoeLiteForCausalLM",
"NemotronHForCausalLM",

View File

@@ -44,10 +44,8 @@ def benchmark_permute(
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
# output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else:
align_block_size = None
qhidden_states = hidden_states
gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
@@ -67,7 +65,6 @@ def benchmark_permute(
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# JIT compilation & warmup
@@ -117,10 +114,8 @@ def benchmark_unpermute(
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_fp8_w8a8:
align_block_size = 128 # deepgemm needs 128 m aligned block
qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else:
align_block_size = None
qhidden_states = hidden_states
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
@@ -142,7 +137,6 @@ def benchmark_unpermute(
topk_ids=topk_ids,
n_expert=num_experts,
expert_map=None,
align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 2adfc8c2177c5b0e8ddeedfd5a8990d80eb496ff
GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -1107,7 +1107,8 @@ class AttentionMainLoop {
if (sliding_window_left != -1) {
pos = std::max(pos, curr_token_pos - sliding_window_left);
}
return pos;
// Clamp to tile end to avoid OOB when window starts past the tile
return std::min(pos, kv_tile_end_pos);
}();
int32_t right_kv_pos = [&]() {
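A minimal standalone sketch of the clamping this hunk introduces, using hypothetical names (base_pos stands for whatever the lambda computed before the window adjustment); only sliding_window_left, curr_token_pos, and kv_tile_end_pos are taken from the hunk itself:
#include <algorithm>
#include <cassert>
#include <cstdint>
// Left KV bound: apply the sliding window, then clamp to the tile end so a
// window that starts past the current KV tile cannot index out of bounds.
int32_t left_kv_pos(int32_t base_pos, int32_t curr_token_pos,
                    int32_t sliding_window_left, int32_t kv_tile_end_pos) {
  int32_t pos = base_pos;
  if (sliding_window_left != -1) {
    pos = std::max(pos, curr_token_pos - sliding_window_left);
  }
  return std::min(pos, kv_tile_end_pos);
}
int main() {
  // Window starts past the tile: the result is clamped to the tile end.
  assert(left_kv_pos(0, 4096, 128, 512) == 512);
  // Window inside the tile: the clamp is a no-op.
  assert(left_kv_pos(0, 300, 128, 512) == 172);
}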

View File

@@ -4,6 +4,9 @@
#include "cpu_attn_impl.hpp"
#include <arm_neon.h>
#include <type_traits>
#ifdef ARM_BF16_SUPPORT
#include "cpu_attn_neon_bfmmla.hpp"
#endif
namespace cpu_attention {
namespace {
@@ -57,7 +60,7 @@ FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
#endif
}
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with ASIMD FMLAs
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
// #FMLAs = (K // 4) * (4 * 2 * M)
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
@@ -381,6 +384,18 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
}
}
};
#ifdef ARM_BF16_SUPPORT
// For BF16 on Arm, reuse the BFMMLA kernels with 32-token alignment.
template <int64_t head_dim>
class AttentionImpl<ISA::NEON, c10::BFloat16, head_dim>
: public AttentionImplNEONBFMMLA<BLOCK_SIZE_ALIGNMENT, ISA::NEON,
head_dim> {};
#endif
} // namespace cpu_attention
#endif // #ifndef CPU_ATTN_NEON_HPP
#undef BLOCK_SIZE_ALIGNMENT
#undef HEAD_SIZE_ALIGNMENT
#undef MAX_Q_HEAD_NUM_PER_ITER
#endif // #ifndef CPU_ATTN_ASIMD_HPP
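The BF16 specialization added above follows a standard partial-specialization dispatch pattern; a minimal sketch under assumed names (GenericNeonImpl, BfmmlaImpl, and bf16_tag are stand-ins, not the real classes):
#include <cstdio>
struct GenericNeonImpl { static constexpr const char* name = "neon-fmla"; };
struct BfmmlaImpl      { static constexpr const char* name = "neon-bfmmla"; };
// Primary template: the generic FMLA-based NEON implementation.
template <typename scalar_t, int head_dim>
struct AttnImpl : GenericNeonImpl {};
// Partial specialization: BF16 inherits the BFMMLA-based implementation,
// mirroring how AttentionImpl<ISA::NEON, c10::BFloat16, head_dim> reuses
// AttentionImplNEONBFMMLA.
struct bf16_tag {};
template <int head_dim>
struct AttnImpl<bf16_tag, head_dim> : BfmmlaImpl {};
int main() {
  std::printf("float -> %s\n", AttnImpl<float, 128>::name);
  std::printf("bf16  -> %s\n", AttnImpl<bf16_tag, 128>::name);
}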

View File

@@ -0,0 +1,682 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#ifndef CPU_ATTN_NEON_BFMMLA_HPP
#define CPU_ATTN_NEON_BFMMLA_HPP
#include "cpu_attn_impl.hpp"
#include <arm_neon.h>
#include <cstdint>
#include <vector>
namespace cpu_attention {
namespace {
// BFMMLA tile dimensions
constexpr int32_t TILE_ROWS = 2; // M dimension
constexpr int32_t TILE_K = 4; // K reduction
constexpr int32_t TILE_COLS = 2; // N dimension (column-pair)
// Derived constants
constexpr int32_t OUTPUT_COLS_PER_BLOCK = 8; // 4 column-pairs
constexpr int32_t K_TOKENS_PER_GROUP = 8; // Tokens grouped in K cache
constexpr int32_t V_TOKENS_PER_ROW_BLOCK = 4; // Tokens per V cache row block
constexpr int32_t K_INNER_STRIDE = K_TOKENS_PER_GROUP * TILE_K;
constexpr int32_t V_INNER_STRIDE = V_TOKENS_PER_ROW_BLOCK * TILE_COLS;
constexpr int32_t PACK_ELEMENTS_PER_K_CHUNK = TILE_ROWS * TILE_K; // A packing
// Matrix Packing and Accumulator
// Reshape two rows of Q into BFMMLA-friendly interleaved
// Input: row0 = [a0,a1,a2,a3], row1 = [b0,b1,b2,b3]
// Output: [a0,a1,a2,a3,b0,b1,b2,b3, a4,a5,a6,a7,b4,b5,b6,b7]
// For K tail (K % TILE_K != 0): pads with zeros to complete the final chunk
FORCE_INLINE void reshape_Q_2xK_for_bfmmla(const c10::BFloat16* __restrict r0,
const c10::BFloat16* __restrict r1,
c10::BFloat16* __restrict dst,
int32_t K) {
const uint16_t* s0 = reinterpret_cast<const uint16_t*>(r0);
const uint16_t* s1 = reinterpret_cast<const uint16_t*>(r1);
uint16_t* d = reinterpret_cast<uint16_t*>(dst);
// Process TILE_K elements at a time (PACK_ELEMENTS_PER_K_CHUNK output)
int32_t k = 0;
for (; k + TILE_K <= K; k += TILE_K, d += PACK_ELEMENTS_PER_K_CHUNK) {
vst1q_u16(d, vcombine_u16(vld1_u16(s0 + k), vld1_u16(s1 + k)));
}
// Handle K tail: pack remaining elements with zero-padding
const int32_t tail = K - k;
if (tail > 0) {
// Pack remaining tail elements: [r0[k..k+tail-1], pad, r1[k..k+tail-1],
// pad]
for (int32_t t = 0; t < tail; ++t) {
d[t] = s0[k + t];
d[t + TILE_K] = s1[k + t];
}
// Zero-pad the rest
for (int32_t t = tail; t < TILE_K; ++t) {
d[t] = 0;
d[t + TILE_K] = 0;
}
}
}
// 2x2 accumulator load/store with compile-time row count
template <int32_t m_rows>
FORCE_INLINE float32x4_t load_acc_2x2(float* base, int64_t ldc, int col_off) {
static_assert(m_rows == 1 || m_rows == 2);
float32x2_t row0 = vld1_f32(base + col_off);
float32x2_t row1 =
(m_rows == 2) ? vld1_f32(base + ldc + col_off) : vdup_n_f32(0.f);
return vcombine_f32(row0, row1);
}
template <int32_t m_rows>
FORCE_INLINE void store_acc_2x2(float32x4_t acc, float* base, int64_t ldc,
int col_off) {
static_assert(m_rows == 1 || m_rows == 2);
vst1_f32(base + col_off, vget_low_f32(acc));
if constexpr (m_rows == 2) {
vst1_f32(base + ldc + col_off, vget_high_f32(acc));
}
}
// Initialize 4 column-pair accumulators for 2 rows (8 columns total)
#define INIT_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows, accum) \
do { \
if (accum) { \
if (m_rows == 2) { \
a0 = load_acc_2x2<2>(Crow, ldc, 0); \
a1 = load_acc_2x2<2>(Crow, ldc, 2); \
a2 = load_acc_2x2<2>(Crow, ldc, 4); \
a3 = load_acc_2x2<2>(Crow, ldc, 6); \
} else { \
a0 = load_acc_2x2<1>(Crow, ldc, 0); \
a1 = load_acc_2x2<1>(Crow, ldc, 2); \
a2 = load_acc_2x2<1>(Crow, ldc, 4); \
a3 = load_acc_2x2<1>(Crow, ldc, 6); \
} \
} else { \
a0 = a1 = a2 = a3 = vdupq_n_f32(0.f); \
} \
} while (0)
// Store 4 column-pair accumulators back to C matrix
#define STORE_ACC_ROWPAIR_4(a0, a1, a2, a3, Crow, ldc, m_rows) \
do { \
if (m_rows == 2) { \
store_acc_2x2<2>(a0, Crow, ldc, 0); \
store_acc_2x2<2>(a1, Crow, ldc, 2); \
store_acc_2x2<2>(a2, Crow, ldc, 4); \
store_acc_2x2<2>(a3, Crow, ldc, 6); \
} else { \
store_acc_2x2<1>(a0, Crow, ldc, 0); \
store_acc_2x2<1>(a1, Crow, ldc, 2); \
store_acc_2x2<1>(a2, Crow, ldc, 4); \
store_acc_2x2<1>(a3, Crow, ldc, 6); \
} \
} while (0)
// Perform 4 BFMMLA operations: acc += A @ B for 4 column-pairs
#define BFMMLA_COMPUTE_4(r0, r1, r2, r3, a, b0, b1, b2, b3) \
do { \
r0 = vbfmmlaq_f32(r0, a, b0); \
r1 = vbfmmlaq_f32(r1, a, b1); \
r2 = vbfmmlaq_f32(r2, a, b2); \
r3 = vbfmmlaq_f32(r3, a, b3); \
} while (0)
// Micro-kernel: updates a small fixed tile using BFMMLA.
// RP = number of row-pairs (1,2,4)
// Computes C[TILE_ROWS*RP, OUTPUT_COLS_PER_BLOCK] += A_packed @ B.
// A_packed interleaves RP row-pairs; B layout is driven by the attention phase:
// - AttentionGemmPhase::QK -> token-column layout (Q @ K^T)
// - AttentionGemmPhase::PV -> token-row layout (P @ V)
// K_static < 0 enables runtime K (PV only)
template <int32_t RP, int32_t K_static, AttentionGemmPhase phase>
FORCE_INLINE void gemm_rowpairs_x8_bfmmla_neon(
const bfloat16_t* const* __restrict A_packed_rp,
const int32_t* __restrict m_rows_rp, const bfloat16_t* __restrict B_blk,
float* __restrict C, int64_t ldc, bool accumulate, int64_t b_stride,
int32_t K_runtime = 0) {
static_assert(RP == 1 || RP == 2 || RP == 4, "RP must be 1,2,4");
static_assert(K_static < 0 || K_static % TILE_K == 0,
"K must be divisible by TILE_K");
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
"Runtime K only supported for PV");
constexpr bool runtime_k = (K_static < 0);
const int32_t K_iters =
runtime_k ? (K_runtime / TILE_K) : (K_static / TILE_K);
const int32_t K_tail = runtime_k ? (K_runtime % TILE_K) : 0;
if (!runtime_k) {
// Help the compiler fold away unused K_runtime when K is compile-time
(void)K_runtime;
}
auto* C_al = C;
const auto* B_al = B_blk;
// Setup A pointers
const bfloat16_t* a_ptr[4] = {
A_packed_rp[0],
(RP >= 2) ? A_packed_rp[1] : nullptr,
(RP >= 4) ? A_packed_rp[2] : nullptr,
(RP >= 4) ? A_packed_rp[3] : nullptr,
};
// Setup B pointers based on layout
const bfloat16_t* b_ptr[4];
if constexpr (phase == AttentionGemmPhase::PV) {
b_ptr[0] = B_blk + 0 * b_stride;
b_ptr[1] = B_blk + 1 * b_stride;
b_ptr[2] = B_blk + 2 * b_stride;
b_ptr[3] = B_blk + 3 * b_stride;
}
float32x4_t acc[4][4];
// Initialize accumulators
#define INIT_RP(rp) \
if constexpr (RP > rp) { \
INIT_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp], accumulate); \
}
INIT_RP(0);
INIT_RP(1);
INIT_RP(2);
INIT_RP(3);
#undef INIT_RP
// Main compute loop
for (int32_t ki = 0; ki < K_iters; ++ki) {
bfloat16x8_t b0, b1, b2, b3;
if constexpr (phase == AttentionGemmPhase::PV) {
b0 = vld1q_bf16(b_ptr[0] + ki * V_INNER_STRIDE);
b1 = vld1q_bf16(b_ptr[1] + ki * V_INNER_STRIDE);
b2 = vld1q_bf16(b_ptr[2] + ki * V_INNER_STRIDE);
b3 = vld1q_bf16(b_ptr[3] + ki * V_INNER_STRIDE);
} else {
const bfloat16_t* b_base = B_al + ki * b_stride;
b0 = vld1q_bf16(b_base + 0 * V_INNER_STRIDE);
b1 = vld1q_bf16(b_base + 1 * V_INNER_STRIDE);
b2 = vld1q_bf16(b_base + 2 * V_INNER_STRIDE);
b3 = vld1q_bf16(b_base + 3 * V_INNER_STRIDE);
}
#define COMPUTE_RP(rp) \
if constexpr (RP > rp) { \
bfloat16x8_t a = vld1q_bf16(a_ptr[rp] + ki * PACK_ELEMENTS_PER_K_CHUNK); \
BFMMLA_COMPUTE_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], a, b0, \
b1, b2, b3); \
}
COMPUTE_RP(0);
COMPUTE_RP(1);
COMPUTE_RP(2);
COMPUTE_RP(3);
#undef COMPUTE_RP
}
// K tail for runtime PV: fallback path
if constexpr (runtime_k) {
if (K_tail > 0) {
const int32_t tail_offset = K_iters * V_INNER_STRIDE;
const int32_t a_tail_offset = K_iters * PACK_ELEMENTS_PER_K_CHUNK;
for (int32_t kt = 0; kt < K_tail; ++kt) {
float32x4_t b_vecs[4];
for (int32_t p = 0; p < 4; ++p) {
const bfloat16_t* bp = b_ptr[p] + tail_offset + kt * TILE_COLS;
const float b0 = vcvtah_f32_bf16(bp[0]);
const float b1 = vcvtah_f32_bf16(bp[1]);
const float32x2_t b_pair = vset_lane_f32(b1, vdup_n_f32(b0), 1);
b_vecs[p] = vcombine_f32(b_pair, b_pair);
}
#define TAIL_RP(rp) \
if constexpr (RP > rp) { \
const bfloat16_t* ap = A_packed_rp[rp] + a_tail_offset; \
float a_row0 = vcvtah_f32_bf16(ap[kt]); \
float a_row1 = \
(m_rows_rp[rp] == 2) ? vcvtah_f32_bf16(ap[kt + TILE_K]) : 0.0f; \
const float32x4_t a_vec = \
vcombine_f32(vdup_n_f32(a_row0), vdup_n_f32(a_row1)); \
for (int32_t p = 0; p < 4; ++p) { \
acc[rp][p] = vmlaq_f32(acc[rp][p], a_vec, b_vecs[p]); \
} \
}
TAIL_RP(0);
TAIL_RP(1);
TAIL_RP(2);
TAIL_RP(3);
#undef TAIL_RP
}
}
}
// Store results
#define STORE_RP(rp) \
if constexpr (RP > rp) { \
STORE_ACC_ROWPAIR_4(acc[rp][0], acc[rp][1], acc[rp][2], acc[rp][3], \
C_al + (rp * 2) * ldc, ldc, m_rows_rp[rp]); \
}
STORE_RP(0);
STORE_RP(1);
STORE_RP(2);
STORE_RP(3);
#undef STORE_RP
}
// Meso-kernel: packs a small MBxK slice of A, then tiles over N and calls the
// micro-kernel for each OUTPUT_COLS_PER_BLOCK chunk. K_static < 0 enables
// runtime K (PV only).
template <int32_t MB, int32_t N, int32_t K_static, AttentionGemmPhase phase>
FORCE_INLINE void gemm_packA_compute_MB_xN(
const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
float* __restrict C, int32_t K_runtime, int64_t lda, int64_t ldc,
int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
static_assert(MB >= 1 && MB <= 8, "MB must be in [1,8]");
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
"N must be a multiple of OUTPUT_COLS_PER_BLOCK");
static_assert(K_static < 0 || K_static % TILE_K == 0,
"K must be divisible by TILE_K");
static_assert(K_static >= 0 || phase == AttentionGemmPhase::PV,
"Runtime K only supported for PV");
constexpr bool runtime_k = (K_static < 0);
const int32_t K_val = runtime_k ? K_runtime : K_static;
// Keep small packs on-stack to avoid heap churn
constexpr int32_t STACK_PACK_STRIDE =
(1024 / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
constexpr int32_t ROW_PAIRS = (MB + 1) / TILE_ROWS;
const int32_t pack_stride =
runtime_k ? ((K_val + TILE_K - 1) / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK
: (K_static / TILE_K) * PACK_ELEMENTS_PER_K_CHUNK;
alignas(64) c10::BFloat16 A_packed_stack[ROW_PAIRS * STACK_PACK_STRIDE];
std::vector<c10::BFloat16> A_packed_heap;
c10::BFloat16* A_packed =
(pack_stride <= STACK_PACK_STRIDE)
? A_packed_stack
: (A_packed_heap.resize(ROW_PAIRS * pack_stride),
A_packed_heap.data());
for (int32_t rp = 0; rp < ROW_PAIRS; ++rp) {
const int32_t m = rp * TILE_ROWS;
const int32_t m_rows = (m + 1 < MB) ? TILE_ROWS : 1;
const c10::BFloat16* A0 = A + m * lda;
const c10::BFloat16* A1 = (m_rows == TILE_ROWS) ? (A + (m + 1) * lda) : A0;
reshape_Q_2xK_for_bfmmla(A0, A1, A_packed + rp * pack_stride, K_val);
}
for (int32_t n = 0; n < N; n += OUTPUT_COLS_PER_BLOCK) {
const c10::BFloat16* B_blk_c10 =
(phase == AttentionGemmPhase::PV)
? (B + (n / TILE_COLS) * b_layout_stride)
: (B + (n / OUTPUT_COLS_PER_BLOCK) * b_layout_stride);
const bfloat16_t* B_blk = reinterpret_cast<const bfloat16_t*>(B_blk_c10);
// Process row-pairs in groups of 4, 2, then 1
int32_t row_pair_idx = 0;
#define PROCESS_RP_GROUP(group_size) \
for (; row_pair_idx + (group_size - 1) < ROW_PAIRS; \
row_pair_idx += group_size) { \
const bfloat16_t* Ap[group_size]; \
int32_t mr[group_size]; \
for (int32_t i = 0; i < group_size; ++i) { \
Ap[i] = reinterpret_cast<const bfloat16_t*>( \
A_packed + (row_pair_idx + i) * pack_stride); \
mr[i] = (((row_pair_idx + i) * TILE_ROWS + 1) < MB) ? TILE_ROWS : 1; \
} \
float* C_blk = C + (row_pair_idx * TILE_ROWS) * ldc + n; \
if constexpr (runtime_k) { \
gemm_rowpairs_x8_bfmmla_neon<group_size, -1, phase>( \
Ap, mr, B_blk, C_blk, ldc, accumulate, b_layout_stride, K_val); \
} else { \
gemm_rowpairs_x8_bfmmla_neon<group_size, K_static, phase>( \
Ap, mr, B_blk, C_blk, ldc, accumulate, \
(phase == AttentionGemmPhase::PV) ? b_layout_stride \
: b_reduction_stride); \
} \
}
PROCESS_RP_GROUP(4);
PROCESS_RP_GROUP(2);
PROCESS_RP_GROUP(1);
#undef PROCESS_RP_GROUP
}
}
// Macro-kernel: iterates over M in MB={8,4,2,1} chunks.
// Supports compile-time K specialization when K >= 0; otherwise uses runtime K
// (runtime K path is only supported for PV).
template <AttentionGemmPhase phase, int32_t N, int32_t K = -1>
FORCE_INLINE void gemm_macro_neon_bfmmla(
const c10::BFloat16* __restrict A, const c10::BFloat16* __restrict B,
float* __restrict C, int32_t M, int32_t K_runtime, int64_t lda, int64_t ldc,
int64_t b_layout_stride, int64_t b_reduction_stride, bool accumulate) {
static_assert(N % OUTPUT_COLS_PER_BLOCK == 0,
"N must be a multiple of OUTPUT_COLS_PER_BLOCK");
if constexpr (K >= 0) {
static_assert(K % TILE_K == 0, "K must be divisible by TILE_K");
for (int32_t m = 0; m < M;) {
const int32_t rem = M - m;
const c10::BFloat16* A_blk = A + m * lda;
float* C_blk = C + m * ldc;
#define DISPATCH_MB(mb) \
gemm_packA_compute_MB_xN<mb, N, K, phase>(A_blk, B, C_blk, 0, lda, ldc, \
b_layout_stride, \
b_reduction_stride, accumulate)
if (rem >= 8) {
DISPATCH_MB(8);
m += 8;
} else if (rem >= 4) {
DISPATCH_MB(4);
m += 4;
} else if (rem >= 2) {
DISPATCH_MB(2);
m += 2;
} else {
DISPATCH_MB(1);
m += 1;
}
#undef DISPATCH_MB
}
} else {
static_assert(phase == AttentionGemmPhase::PV,
"Runtime K specialization only supported for PV.");
const int32_t K_val = K_runtime;
for (int32_t m = 0; m < M;) {
const int32_t rem = M - m;
const c10::BFloat16* A_blk = A + m * lda;
float* C_blk = C + m * ldc;
#define DISPATCH_MB_RUNTIME(mb) \
gemm_packA_compute_MB_xN<mb, N, -1, phase>(A_blk, B, C_blk, K_val, lda, ldc, \
b_layout_stride, \
b_reduction_stride, accumulate)
if (rem >= 8) {
DISPATCH_MB_RUNTIME(8);
m += 8;
} else if (rem >= 4) {
DISPATCH_MB_RUNTIME(4);
m += 4;
} else if (rem >= 2) {
DISPATCH_MB_RUNTIME(2);
m += 2;
} else {
DISPATCH_MB_RUNTIME(1);
m += 1;
}
#undef DISPATCH_MB_RUNTIME
}
}
}
#undef INIT_ACC_ROWPAIR_4
#undef STORE_ACC_ROWPAIR_4
#undef BFMMLA_COMPUTE_4
} // namespace
// TileGemm Adapter for Attention
template <typename kv_cache_t, int32_t BlockTokens, int32_t HeadDim>
class TileGemmNEONBFMMLA {
public:
template <AttentionGemmPhase phase, int32_t head_dim_ct>
FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile,
kv_cache_t* __restrict__ b_tile,
float* __restrict__ c_tile, const int64_t lda,
[[maybe_unused]] const int64_t ldb,
const int64_t ldc,
[[maybe_unused]] const int32_t block_size,
[[maybe_unused]] const int32_t dynamic_k_size,
const bool accum_c) {
static_assert(BlockTokens % OUTPUT_COLS_PER_BLOCK == 0);
// BFMMLA kernels require compile-time head_dim; keep head_dim_ct only for
// API parity with other tile_gemm implementations.
if constexpr (head_dim_ct >= 0) {
static_assert(head_dim_ct == HeadDim,
"BFMMLA expects head_dim_ct to match HeadDim; PV passes "
"-1 for API parity.");
}
if constexpr (phase == AttentionGemmPhase::QK) {
const int64_t b_reduction_stride = K_INNER_STRIDE;
const int64_t b_token_block_stride = (HeadDim / TILE_K) * K_INNER_STRIDE;
gemm_macro_neon_bfmmla<AttentionGemmPhase::QK, BlockTokens, HeadDim>(
reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
m_size, 0, lda, ldc, b_token_block_stride, b_reduction_stride,
accum_c);
} else {
const int64_t b_pair_stride =
(block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
// PV gemm with runtime K specialization
switch (dynamic_k_size) {
case 32:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 32>(
reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
m_size, 32, lda, ldc, b_pair_stride, 0, accum_c);
break;
case 128:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 128>(
reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
m_size, 128, lda, ldc, b_pair_stride, 0, accum_c);
break;
case 256:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim, 256>(
reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
m_size, 256, lda, ldc, b_pair_stride, 0, accum_c);
break;
default:
gemm_macro_neon_bfmmla<AttentionGemmPhase::PV, HeadDim>(
reinterpret_cast<const c10::BFloat16*>(a_tile), b_tile, c_tile,
m_size, dynamic_k_size, lda, ldc, b_pair_stride, 0, accum_c);
break;
}
}
}
};
// Shared ASIMD BFMMLA implementation (BF16 only). The block size alignment and
// ISA tag are template parameters so we can reuse the same kernels for
// different NEON configurations.
template <int64_t block_size_alignment, ISA isa_type, int64_t head_dim>
class AttentionImplNEONBFMMLA {
public:
using query_t = c10::BFloat16;
using q_buffer_t = c10::BFloat16;
using kv_cache_t = c10::BFloat16;
using logits_buffer_t = float;
using partial_output_buffer_t = float;
using prob_buffer_t = c10::BFloat16;
static constexpr int64_t BlockSizeAlignment = block_size_alignment;
// HeadDimAlignment equals head_dim so that the PV phase processes
// the full head dimension in a single gemm call.
static constexpr int64_t HeadDimAlignment = head_dim;
static constexpr int64_t MaxQHeadNumPerIteration = 16;
static constexpr int64_t HeadDim = head_dim;
static constexpr ISA ISAType = isa_type;
static constexpr bool scale_on_logits = false;
static_assert(HeadDim % OUTPUT_COLS_PER_BLOCK == 0);
static_assert(BlockSizeAlignment % OUTPUT_COLS_PER_BLOCK == 0);
static_assert(HeadDim % TILE_K == 0, "HeadDim must be a multiple of TILE_K");
public:
template <template <typename tile_gemm_t> typename attention>
FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
attention<
TileGemmNEONBFMMLA<kv_cache_t, static_cast<int32_t>(BlockSizeAlignment),
static_cast<int32_t>(HeadDim)>>
attention_iteration;
attention_iteration(CPU_ATTENTION_PARAMS);
}
// Key cache stride per token group (TokenColumn layout; QK)
static constexpr int64_t k_cache_token_group_stride(
[[maybe_unused]] const int32_t block_size) {
static_assert(BlockSizeAlignment % K_TOKENS_PER_GROUP == 0);
return (BlockSizeAlignment / K_TOKENS_PER_GROUP) *
((head_dim / TILE_K) * K_INNER_STRIDE);
}
// Value cache stride per token group (TokenRow layout; PV)
static constexpr int64_t v_cache_token_group_stride(
[[maybe_unused]] const int32_t block_size) {
static_assert(BlockSizeAlignment % V_TOKENS_PER_ROW_BLOCK == 0);
return (BlockSizeAlignment / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
}
// The stride to move to the "next" head_dim group
// is the full V cache size per head, since HeadDimAlignment == head_dim.
// Hence, the stride is not used in this case
static constexpr int64_t v_cache_head_group_stride(
[[maybe_unused]] const int32_t block_size) {
return head_dim * block_size;
}
// Convert Q heads to BF16 and apply scale factor using native BF16 intrinsics
static void copy_q_heads_tile(c10::BFloat16* __restrict__ src,
c10::BFloat16* __restrict__ q_buffer,
const int32_t q_num,
const int32_t q_heads_per_kv,
const int64_t q_num_stride,
const int64_t q_head_stride, float scale) {
constexpr int32_t dim = static_cast<int32_t>(head_dim);
const float32x4_t scale_vec = vdupq_n_f32(scale);
for (int32_t qi = 0; qi < q_num; ++qi) {
for (int32_t hi = 0; hi < q_heads_per_kv; ++hi) {
c10::BFloat16* __restrict__ curr_q =
src + qi * q_num_stride + hi * q_head_stride;
c10::BFloat16* __restrict__ dst =
q_buffer + qi * q_heads_per_kv * head_dim + hi * head_dim;
for (int32_t i = 0; i < dim; i += OUTPUT_COLS_PER_BLOCK) {
bfloat16x8_t in8 =
vld1q_bf16(reinterpret_cast<const bfloat16_t*>(curr_q + i));
float32x4_t lo = vmulq_f32(vcvtq_low_f32_bf16(in8), scale_vec);
float32x4_t hi = vmulq_f32(vcvtq_high_f32_bf16(in8), scale_vec);
bfloat16x4_t lo_b = vcvt_bf16_f32(lo);
bfloat16x4_t hi_b = vcvt_bf16_f32(hi);
bfloat16x8_t out = vcombine_bf16(lo_b, hi_b);
vst1q_bf16(reinterpret_cast<bfloat16_t*>(dst + i), out);
}
}
}
}
public:
// Reshape and cache K/V into BFMMLA-optimized layouts
// K cache:
// [block_size/K_TOKENS_PER_GROUP][head_dim/TILE_K][K_INNER_STRIDE]
// - TokenColumn
// V cache:
// [head_dim/TILE_COLS][block_size/V_TOKENS_PER_ROW_BLOCK][V_INNER_STRIDE]
// - TokenRows
static void reshape_and_cache(
const c10::BFloat16* __restrict__ key,
const c10::BFloat16* __restrict__ value,
c10::BFloat16* __restrict__ key_cache,
c10::BFloat16* __restrict__ value_cache,
const int64_t* __restrict__ slot_mapping, const int64_t token_num,
const int64_t key_token_num_stride, const int64_t value_token_num_stride,
const int64_t head_num, const int64_t key_head_num_stride,
const int64_t value_head_num_stride,
[[maybe_unused]] const int64_t num_blocks,
const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
const int64_t block_size,
[[maybe_unused]] const int64_t block_size_stride) {
const int64_t k_block_stride = (head_dim / TILE_K) * K_INNER_STRIDE;
const int64_t v_pair_stride =
(block_size / V_TOKENS_PER_ROW_BLOCK) * V_INNER_STRIDE;
#pragma omp parallel for
for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
const int64_t pos = slot_mapping[token_idx];
if (pos < 0) continue;
const int64_t block_idx = pos / block_size;
const int64_t block_offset = pos % block_size;
// Key cache: TokenColumn QK
{
const c10::BFloat16* __restrict key_src =
key + token_idx * key_token_num_stride +
head_idx * key_head_num_stride;
c10::BFloat16* __restrict key_base = key_cache +
block_idx * num_blocks_stride +
head_idx * cache_head_num_stride;
const int64_t block_in_block = block_offset / K_TOKENS_PER_GROUP;
const int64_t pair_in_block =
(block_offset % K_TOKENS_PER_GROUP) / TILE_COLS;
const int64_t lane_base = (block_offset & 1) ? TILE_K : 0;
c10::BFloat16* __restrict block_base =
key_base + block_in_block * k_block_stride;
for (int64_t hd4 = 0; hd4 < head_dim / TILE_K; ++hd4) {
uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(
block_base + hd4 * K_INNER_STRIDE +
pair_in_block * V_INNER_STRIDE + lane_base);
const uint16_t* src_u16 =
reinterpret_cast<const uint16_t*>(key_src + hd4 * TILE_K);
vst1_u16(dst_u16, vld1_u16(src_u16));
}
}
// Value cache: TokenRow PV
{
const c10::BFloat16* __restrict value_src =
value + token_idx * value_token_num_stride +
head_idx * value_head_num_stride;
c10::BFloat16* __restrict value_base =
value_cache + block_idx * num_blocks_stride +
head_idx * cache_head_num_stride;
const int64_t row_block = block_offset / V_TOKENS_PER_ROW_BLOCK;
const int64_t lane = block_offset & (V_TOKENS_PER_ROW_BLOCK - 1);
c10::BFloat16* __restrict row_block_base =
value_base + row_block * V_INNER_STRIDE;
for (int64_t hd2 = 0; hd2 < head_dim / TILE_COLS; ++hd2) {
c10::BFloat16* __restrict dst_val =
row_block_base + hd2 * v_pair_stride;
const uint16_t* src_u16 =
reinterpret_cast<const uint16_t*>(value_src);
uint16_t* dst_u16 = reinterpret_cast<uint16_t*>(dst_val);
dst_u16[lane] = src_u16[hd2 * TILE_COLS + 0];
dst_u16[lane + V_TOKENS_PER_ROW_BLOCK] =
src_u16[hd2 * TILE_COLS + 1];
}
}
}
}
}
};
} // namespace cpu_attention
#endif  // CPU_ATTN_NEON_BFMMLA_HPP
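The interleaved Q layout described in the header comments above can be checked against a plain scalar reference; this sketch is hypothetical (pack_2xK_reference is not part of the file) and only mirrors the documented ordering, including the zero-padded K tail:
#include <cassert>
#include <vector>
// Rows r0 and r1 of length K are emitted in chunks of TILE_K = 4: row-0 lanes
// first, then row-1 lanes, with the final chunk zero-padded when K % 4 != 0.
std::vector<float> pack_2xK_reference(const std::vector<float>& r0,
                                      const std::vector<float>& r1) {
  constexpr int TILE_K = 4;
  const int K = static_cast<int>(r0.size());
  const int chunks = (K + TILE_K - 1) / TILE_K;
  std::vector<float> dst(static_cast<size_t>(chunks) * 2 * TILE_K, 0.0f);
  for (int c = 0; c < chunks; ++c) {
    for (int t = 0; t < TILE_K; ++t) {
      const int k = c * TILE_K + t;
      if (k < K) {
        dst[c * 2 * TILE_K + t] = r0[k];           // row-0 lanes of this chunk
        dst[c * 2 * TILE_K + TILE_K + t] = r1[k];  // row-1 lanes of this chunk
      }
    }
  }
  return dst;
}
int main() {
  // K = 8: expect [a0..a3, b0..b3, a4..a7, b4..b7].
  std::vector<float> a{0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<float> b{10, 11, 12, 13, 14, 15, 16, 17};
  auto p = pack_2xK_reference(a, b);
  assert(p[0] == 0 && p[4] == 10 && p[8] == 4 && p[12] == 14);
}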

View File

@@ -38,6 +38,15 @@ struct KernelVecType<c10::BFloat16> {
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__s390x__)
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__)
template <>
struct KernelVecType<c10::BFloat16> {

View File

@@ -152,3 +152,14 @@ struct enable_sm120_only : Kernel {
#endif
}
};
// SM12x family includes SM120 (RTX 5090) and SM121 (DGX Spark GB10)
template <typename Kernel>
struct enable_sm120_family : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};
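A host-side restatement of the architecture guard, as a hedged sketch (the numeric values follow the __CUDA_ARCH__ convention of compute capability times 100; the SM names come from the comment above):
constexpr bool is_sm120_family(int cuda_arch) {
  return cuda_arch >= 1200 && cuda_arch < 1300;
}
static_assert(is_sm120_family(1200));   // SM120 (RTX 5090)
static_assert(is_sm120_family(1210));   // SM121 (DGX Spark GB10)
static_assert(!is_sm120_family(1300));  // the next family stays excluded
int main() {}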

View File

@@ -14,12 +14,10 @@ void moe_permute(
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor& permuted_input, // [permuted_size, hidden]
torch::Tensor& expert_first_token_offset, // [n_local_expert + 1]
torch::Tensor& inv_permuted_idx, // [n_token, topk]
torch::Tensor& permuted_idx, // [permute_size]
torch::Tensor& m_indices) { // [align_expand_m]
torch::Tensor& permuted_idx) { // [permute_size]
TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
@@ -34,8 +32,6 @@ void moe_permute(
"token_expert_indices shape must be same as inv_permuted_idx");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
align_block_size.has_value() ? align_block_size.value() : -1;
auto stream = at::cuda::getCurrentCUDAStream().stream();
const long sorter_size =
CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert);
@@ -73,42 +69,15 @@ void moe_permute(
get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);
// DeepGEMM: use getMIndices kernel to compute
// 1) align_expert_first_token_offset (aligned prefix offsets)
// 2) m_indices (expert id for each aligned row)
// eg. expert0: 3, expert1: 5, expert2: 2 tokens respectively
// expert_first_token_offset = [0, 3, 8, 10], align_block_size = 4
// expert0: 3->4, expert1: 5->8, expert2: 2->4
// align_expert_first_token_offset = [0, 4, 12, 16]
// so m_indices = [0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2]
torch::Tensor align_expert_first_token_offset;
const int64_t* aligned_expert_first_token_offset_ptr = nullptr;
if (align_block_size.has_value()) {
align_expert_first_token_offset =
torch::zeros_like(expert_first_token_offset);
getMIndices(get_ptr<int64_t>(expert_first_token_offset),
get_ptr<int64_t>(align_expert_first_token_offset),
get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
stream);
aligned_expert_first_token_offset_ptr =
get_ptr<int64_t>(align_expert_first_token_offset);
}
// dispatch expandInputRowsKernelLauncher
MOE_DISPATCH(input.scalar_type(), [&] {
expandInputRowsKernelLauncher<scalar_t>(
get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
get_ptr<int64_t>(expert_first_token_offset),
aligned_expert_first_token_offset_ptr, n_token, valid_num_ptr, n_hidden,
topk, n_local_expert, align_block_size_value, stream);
get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
n_hidden, topk, n_local_expert, stream);
});
// this is only required for DeepGemm and not required for CUTLASS group gemm
if (align_block_size.has_value()) {
expert_first_token_offset.copy_(align_expert_first_token_offset);
}
}
void moe_unpermute(
@@ -201,16 +170,13 @@ void shuffle_rows(const torch::Tensor& input_tensor,
#else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor& permuted_input,
torch::Tensor& expert_first_token_offset,
torch::Tensor& src_row_id2dst_row_id_map,
torch::Tensor& m_indices) {
torch::Tensor& inv_permuted_idx, torch::Tensor& permuted_idx) {
TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
}

View File

@@ -168,64 +168,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
topk_id_ptr, size, expert_map_ptr, num_experts);
}
template <bool ALIGN_BLOCK_SIZE>
__global__ void getMIndicesKernel(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset,
int* m_indices, const int num_local_expert,
const int align_block_size) {
int eidx = blockIdx.x;
int tidx = threadIdx.x;
extern __shared__ int64_t smem_expert_first_token_offset[];
for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
}
__syncthreads();
auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
auto first_token_offset = smem_expert_first_token_offset[eidx];
int n_token_in_expert = last_token_offset - first_token_offset;
if constexpr (ALIGN_BLOCK_SIZE) {
n_token_in_expert = (n_token_in_expert + align_block_size - 1) /
align_block_size * align_block_size;
// round up to ALIGN_BLOCK_SIZE
int64_t accumulate_align_offset = 0;
for (int i = 1; i <= eidx + 1; i++) {
int n_token = smem_expert_first_token_offset[i] -
smem_expert_first_token_offset[i - 1];
accumulate_align_offset =
accumulate_align_offset + (n_token + align_block_size - 1) /
align_block_size * align_block_size;
if (i == eidx) {
first_token_offset = accumulate_align_offset;
}
// last block store align_expert_first_token_offset
if (eidx == num_local_expert - 1 && threadIdx.x == 0) {
align_expert_first_token_offset[i] = accumulate_align_offset;
}
}
}
for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) {
// update m_indice with expert id
m_indices[first_token_offset + idx] = eidx;
}
}
void getMIndices(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset, int* m_indices,
int num_local_expert, const int align_block_size,
cudaStream_t stream) {
int block = 256;
int grid = num_local_expert;
int smem_size = sizeof(int64_t) * (num_local_expert + 1);
if (align_block_size == -1) {
getMIndicesKernel<false><<<grid, block, smem_size, stream>>>(
expert_first_token_offset, align_expert_first_token_offset, m_indices,
num_local_expert, align_block_size);
} else {
getMIndicesKernel<true><<<grid, block, smem_size, stream>>>(
expert_first_token_offset, align_expert_first_token_offset, m_indices,
num_local_expert, align_block_size);
}
}
#endif

View File

@@ -60,10 +60,9 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream);
int num_local_experts, cudaStream_t stream);
template <class T, class OutputType>
void finalizeMoeRoutingKernelLauncher(
@@ -76,9 +75,4 @@ void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
const int* expert_map_ptr, int num_experts,
cudaStream_t stream);
void getMIndices(int64_t* expert_first_token_offset,
int64_t* align_expert_first_token_offset, int* m_indices,
int num_local_expert, const int align_block_size,
cudaStream_t stream);
#include "moe_permute_unpermute_kernel.inl"

View File

@@ -1,14 +1,13 @@
#pragma once
template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
template <typename T, bool CHECK_SKIPPED>
__global__ void expandInputRowsKernel(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_dest_rows, int64_t const cols, int64_t k,
int num_local_experts, int align_block_size) {
int num_local_experts) {
// Reverse permutation map.
// I do this so that later, we can use the source -> dest map to do the k-way
// reduction and unpermuting. I need the reverse map for that reduction to
@@ -19,24 +18,6 @@ __global__ void expandInputRowsKernel(
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
int expert_id = sorted_experts[expanded_dest_row];
if constexpr (ALIGN_BLOCK_SIZE) {
// convert (unaligned) expanded_dest_row -> aligned expanded_dest_row.
// aligned_expert_first_token_offset[e] provides the aligned prefix start
// for expert e. For non-local experts we map to the end (total aligned M).
int64_t aligned_base = 0;
int64_t token_offset_in_expert = 0;
if (expert_id >= num_local_experts) {
aligned_base =
__ldg(aligned_expert_first_token_offset + num_local_experts);
token_offset_in_expert = 0;
} else {
aligned_base = __ldg(aligned_expert_first_token_offset + expert_id);
token_offset_in_expert =
expanded_dest_row - __ldg(expert_first_token_offset + expert_id);
}
expanded_dest_row = aligned_base + token_offset_in_expert;
}
if (threadIdx.x == 0) {
assert(expanded_dest_row <= INT32_MAX);
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
@@ -76,29 +57,25 @@ void expandInputRowsKernelLauncher(
T const* unpermuted_input, T* permuted_output, int* sorted_experts,
int const* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
int64_t const* expert_first_token_offset,
int64_t const* aligned_expert_first_token_offset, int64_t const num_rows,
int64_t const* expert_first_token_offset, int64_t const num_rows,
int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
int num_local_experts, const int& align_block_size, cudaStream_t stream) {
int num_local_experts, cudaStream_t stream) {
int64_t const blocks = num_rows * k;
int64_t const threads = 256;
using FuncPtr = decltype(&expandInputRowsKernel<T, true, true>);
FuncPtr func_map[2][2] = {
{&expandInputRowsKernel<T, false, false>,
&expandInputRowsKernel<T, false, true>},
{&expandInputRowsKernel<T, true, false>,
&expandInputRowsKernel<T, true, true>},
using FuncPtr = decltype(&expandInputRowsKernel<T, true>);
FuncPtr func_map[2] = {
&expandInputRowsKernel<T, false>,
&expandInputRowsKernel<T, true>,
};
bool is_check_skip = num_valid_tokens_ptr != nullptr;
bool is_align_block_size = align_block_size != -1;
auto func = func_map[is_check_skip][is_align_block_size];
auto func = func_map[is_check_skip];
func<<<blocks, threads, 0, stream>>>(
unpermuted_input, permuted_output, sorted_experts,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row, permuted_idx,
expert_first_token_offset, aligned_expert_first_token_offset, num_rows,
num_valid_tokens_ptr, cols, k, num_local_experts, align_block_size);
expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
num_local_experts);
}
template <class T, class U>
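With the align-block-size variant gone, the launcher above selects between just two instantiations by a single boolean; a standalone sketch of that dispatch shape (expand_rows_stub is a hypothetical stand-in for the kernel):
#include <cstdio>
template <bool CHECK_SKIPPED>
void expand_rows_stub() {
  std::printf("CHECK_SKIPPED = %s\n", CHECK_SKIPPED ? "true" : "false");
}
int main() {
  using FuncPtr = void (*)();
  // Two entries instead of the former 2x2 table: index 0 = no skip check.
  FuncPtr func_map[2] = {&expand_rows_stub<false>, &expand_rows_stub<true>};
  const int* num_valid_tokens_ptr = nullptr;  // assumed input for the sketch
  func_map[num_valid_tokens_ptr != nullptr]();  // picks the <false> variant
}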

View File

@@ -99,9 +99,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"moe_permute(Tensor input, Tensor topk_ids,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"int topk, Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
"permuted_idx, Tensor! m_indices)->()");
"permuted_idx)->()");
m.def(
"moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"

View File

@@ -103,7 +103,8 @@ struct cutlass_3x_gemm_fp8_blockwise {
MainloopScheduler
>::CollectiveOp;
using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
// SM12x family to support both SM120 (RTX 5090) and SM121 (DGX Spark)
using KernelType = enable_sm120_family<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
struct GemmKernel : public KernelType {};

View File

@@ -1365,13 +1365,12 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
return out_c;
}
#if defined(__gfx950__) // TODO: Add NAVI support
// This version targets big A[] cases, where it is much larger than LDS
// capacity
// This version targets skinny cases where CUs are not filled
// Wave-SplitK is used with reduction done via atomics.
#if defined(__gfx950__)
#define WVSPLITKRC_1KPASS
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N, int GrpsShrB>
int UNRL, int N, int GrpsShrB, int CHUNKK>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
__attribute__((amdgpu_waves_per_eu(1, 1)))
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
@@ -1383,12 +1382,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
int* cntr = (int*)(&glbl[M * N]);
constexpr int NTILE = 16;
constexpr int WVLDS_ = (NTILE * THRDS * A_CHUNK);
constexpr int APAD = 1;
constexpr int ASTRD = 64;
constexpr int BPAD = 1;
constexpr int BSTRD = 64;
constexpr int WVLDS = ((WVLDS_ + (WVLDS_ / BSTRD) * 4 * BPAD));
constexpr int WVLDS_ = THRDS * A_CHUNK / CHUNKK;
constexpr int WVLDS = ((WVLDS_ + A_CHUNK * BPAD)) * YTILE;
constexpr int max_lds_len = LDS_SIZE / 2;
@@ -1442,17 +1440,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
break;
}
#else
int constexpr kFit = 512;
int constexpr kFit = 512 / CHUNKK;
int constexpr kfitsPerRdc = 1;
#endif
bool doRdc = (kfitsPerRdc * kFit < K);
bool doRdc = true; // Assuming (kfitsPerRdc * kFit < K) is always true
uint32_t numCuWithFullK =
((M + (WvPrGrp * YTILE / GrpsShrB) - 1) / (WvPrGrp * YTILE / GrpsShrB));
uint32_t Mmod = numCuWithFullK * (WvPrGrp * YTILE / GrpsShrB);
// given above k-split, find this wave's position
uint32_t kFitPdd = kFit + (kFit / ASTRD) * APAD;
uint32_t kFitPdd = kFit * CHUNKK + ((kFit * CHUNKK) / ASTRD) * APAD;
uint32_t m0 = (blockIdx.x * WvPrGrp / GrpsShrB) * YTILE;
uint32_t m1 = ((threadIdx.y % WvPrGrp) / GrpsShrB) * YTILE;
uint32_t m = (m0 + m1) % Mmod;
@@ -1460,8 +1458,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t k_end = (m0 / Mmod + 1) * kFit * kfitsPerRdc;
const uint32_t k_rnd = (K + kFit * kfitsPerRdc - 1) / (kFit * kfitsPerRdc);
scalar8 sum4[N / NTILE / GrpsShrB][1];
bigType bigB_[YTILE / GrpsShrB][UNRL];
scalar8 sum4[N / NTILE / GrpsShrB][1] = {0};
bigType bigB_[YTILE / GrpsShrB / CHUNKK][UNRL];
const uint32_t bLoader = (threadIdx.y % GrpsShrB);
uint32_t kBase = 0;
if (k_str >= K) return;
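The CHUNKK indexing used throughout the rewritten loops splits each wave's lanes into CHUNKK groups: the low bits give the offset within a shared K range and the high bits select the Y sub-tile. A small arithmetic illustration with assumed values (THRDS = 64, A_CHUNK = 8, CHUNKK = 2):
#include <cstdio>
int main() {
  const int THRDS = 64, A_CHUNK = 8, CHUNKK = 2;
  for (int tid : {0, 31, 32, 63}) {
    int k_lane = tid % (THRDS / CHUNKK);  // offset within the shared K range
    int y_grp  = tid / (THRDS / CHUNKK);  // which Y sub-tile this lane loads
    std::printf("tid=%2d -> k offset=%3d, y group=%d\n",
                tid, k_lane * A_CHUNK, y_grp);
  }
}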
@@ -1498,12 +1496,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k_str + k2 * THRDS * A_CHUNK;
uint32_t k_ = k + threadIdx.x * A_CHUNK;
uint32_t k_ = k + (threadIdx.x % (THRDS / CHUNKK)) * A_CHUNK;
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
#pragma unroll
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
bigB_[y][k2].h8 = (loadnt(
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK)
bigB_[y / CHUNKK][k2].h8 = (loadnt(
(scalar8*)(&B_[min__((y + threadIdx.x / (THRDS / CHUNKK)) * GrpsShrB +
bLoader + m,
M - 1) *
K])));
}
{
#else
@@ -1556,48 +1557,51 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (reloada) {
#endif
constexpr int sprdN = 4;
const uint32_t thrd = ((threadIdx.y / sprdN) * THRDS + threadIdx.x);
const uint32_t thrd = threadIdx.x % (THRDS / CHUNKK);
#ifndef WVSPLITKRC_1KPASS
#pragma unroll
for (int k = 0; k < kFit; k += THRDS * (WvPrGrp / sprdN) * A_CHUNK) {
for (int k = 0; k < kFit;
k += (THRDS * (WvPrGrp / sprdN) * A_CHUNK) / CHUNKK) {
#else
const unsigned int k = 0;
{
#endif
unsigned int kOff = k + (thrd * A_CHUNK);
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
const unsigned int k_in = kOffcp + ((threadIdx.y % sprdN)) * K;
const unsigned int k_ot = kOff + ((threadIdx.y % sprdN)) * kFitPdd;
for (unsigned int n = 0; n < N / 2; n += sprdN) {
__builtin_amdgcn_global_load_lds((int*)(&A[k_in + n * K]),
(int*)(&s[(k_ot + n * kFitPdd)]),
16, 0, 0);
if (((threadIdx.y % sprdN)) + n + N / 2 >= actlN) continue;
unsigned int kOffcp =
k_str + kOff; // min__(K - A_CHUNK, k_str + kOff);
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
__builtin_amdgcn_global_load_lds(
(int*)(&A[k_in + (n + N / 2) * K]),
(int*)(&s[(k_ot + (n + N / 2) * kFitPdd)]), 16, 0, 0);
(int*)(&A[min__(
K * actlN - A_CHUNK,
kOffcp + K * (n / CHUNKK +
(N / CHUNKK) * (threadIdx.x / (64 / CHUNKK)) +
(threadIdx.y % sprdN)))]),
(int*)(&s[(k +
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
16, 0, 0);
}
// Stage loaded B[] to LDS for MFMA swizzling...
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
uint32_t k_ = k + threadIdx.x * A_CHUNK;
uint32_t k_ = k + (threadIdx.x % (THRDS / CHUNKK)) * A_CHUNK;
const bool oob_k = (k_ >= K);
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++) {
uint32_t idx = threadIdx.x * 4 +
(y * GrpsShrB + bLoader) * ((THRDS + BPAD) * 4);
for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK) {
uint32_t idx =
(threadIdx.x % (THRDS / CHUNKK)) * 4 +
((y + threadIdx.x / (THRDS / CHUNKK)) * GrpsShrB + bLoader) *
((THRDS / CHUNKK + BPAD) * 4);
// zero out if oob
*((scalar8*)&myStg[idx]) =
(oob_k || (y * GrpsShrB + bLoader + m >= M))
(oob_k)  // TODO: is (y * GrpsShrB + bLoader + m >= M) ever necessary?
? 0
: bigB_[y][k2].h8;
: bigB_[y / CHUNKK][k2].h8;
}
}
}
}
}
#ifndef WVSPLITKRC_1KPASS
// Fire load of next B[] chunk...
if ((k1 + THRDS * A_CHUNK * UNRL < k_end) &&
@@ -1608,40 +1612,50 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t k_ = k + threadIdx.x * A_CHUNK;
const scalar_t* B_ = &B[min__(k_, K - A_CHUNK)];
#pragma unroll
for (uint32_t y = 0; y < YTILE / GrpsShrB; y++)
bigB_[y][k2].h8 = (loadnt(
(scalar8*)(&B_[min__(y * GrpsShrB + bLoader + m, M - 1) * K])));
for (uint32_t y = 0; y < YTILE / GrpsShrB; y += CHUNKK)
bigB_[y / CHUNKK][k2].h8 = (loadnt(
(scalar8*)(&B_[min__((y + threadIdx.x / (THRDS / CHUNKK)) *
GrpsShrB +
bLoader + m,
M - 1) *
K])));
}
#endif
// B[] staging is cooperative across GrpsShrB, so sync here before reading
// back
// back. This wait is currently inserted by the compiler, but not guaranteed.
asm volatile("s_waitcnt 0");
__syncthreads();
// read back B[] swizzled for MFMA...
bigType bigB[YTILE][UNRL];
bigType bigB[YTILE / CHUNKK][UNRL];
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
for (uint32_t y = 0; y < YTILE; y++) {
unsigned int idx = (threadIdx.x % YTILE) * ((THRDS + BPAD) * 4) +
(threadIdx.x / YTILE) * 4 + y * 16;
for (uint32_t y = 0; y < YTILE / CHUNKK; y++) {
unsigned int idx =
(threadIdx.x % YTILE) * ((THRDS / CHUNKK + BPAD) * 4) +
(threadIdx.x / YTILE) * 4 + y * 16;
bigB[y][k2].h8 = *((scalar8*)&myStg[idx]);
}
}
// Read back A[] swizzled for MFMA...
bigType bigA[N / GrpsShrB][UNRL];
bigType bigA[N / GrpsShrB / CHUNKK][UNRL];
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK - kBase - k_str;
#pragma unroll
for (uint32_t nt = 0; nt < N / GrpsShrB; nt += NTILE)
#pragma unroll
for (uint32_t n = 0; n < NTILE; n++) {
uint32_t idxa = (nt + (threadIdx.x % NTILE) +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB)) *
kFitPdd +
A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
bigA[nt + n][k2] = *((const bigType*)(&(s[idxa])));
for (uint32_t n = 0; n < NTILE / CHUNKK; n++) {
uint32_t idxa =
((nt + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) % (N / CHUNKK) +
(threadIdx.x % NTILE)) *
kFitPdd +
((nt + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) /
(N / CHUNKK)) *
A_CHUNK * (64 / CHUNKK) +
A_CHUNK * ((threadIdx.x / NTILE) + n * 4) + k;
bigA[nt / CHUNKK + n][k2] = *((const bigType*)(&(s[idxa])));
}
}
@@ -1650,152 +1664,75 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
#pragma unroll
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
if constexpr (std::is_same_v<scalar_t, half>) {
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
0, 0);
} else { // bf16
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
bigA[nt * NTILE + 0][k2].h4[0], bigB[0][k2].h4[0],
(k1 == k_str) ? ((scalar8){0}) : sum4[nt][0], 0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
bigA[nt * NTILE + 0][k2].h4[1], bigB[0][k2].h4[1], sum4[nt][0], 0,
0, 0);
}
#pragma unroll
for (uint32_t j = 1; j < YTILE; j++) {
for (uint32_t j = 0; j < YTILE / CHUNKK; j++) {
if constexpr (std::is_same_v<scalar_t, half>) {
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16f16(
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x32_f16(
bigA[nt * (YTILE / CHUNKK) + j][k2].h8, bigB[j][k2].h8,
sum4[nt][0], 0, 0, 0);
} else { // bf16
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
bigA[nt * NTILE + j][k2].h4[0], bigB[j][k2].h4[0], sum4[nt][0],
0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(
bigA[nt * NTILE + j][k2].h4[1], bigB[j][k2].h4[1], sum4[nt][0],
0, 0, 0);
sum4[nt][0] = __builtin_amdgcn_mfma_f32_16x16x32_bf16(
bigA[nt * (YTILE / CHUNKK) + j][k2].h8, bigB[j][k2].h8,
sum4[nt][0], 0, 0, 0);
}
}
}
}
}
if (!doRdc) {
if (m + (threadIdx.x % 16) < M) {
scalar_t biases[N / NTILE / GrpsShrB][4] = {0};
if (m + (threadIdx.x % 16) < M) {
int my_cntr;
int mindx = m + (threadIdx.x % 16);
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
// Atomic add the output, read biases
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
for (uint32_t j = 0; j < 4; j++) {
// int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
// (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
// int adr = mindx + M * nindx;
int g_nindx =
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx + M * g_nindx * 4;
atomicAdd(&glbl[g_adr], sum4[nt][0][j]);
}
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr_ = mindx + M * nindx_ / 4;
// Update the complete counter
my_cntr = atomicAdd(&cntr[adr_], 1);
float vals[N / NTILE / GrpsShrB][4] = {};
// If we're the last k-shard, read back the value and convert...
if (my_cntr + 1 == k_rnd) {
if (BIAS)
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int mindx = m + (threadIdx.x % 16);
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
}
}
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int mindx = m + (threadIdx.x % 16);
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr = mindx + M * nindx;
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS) sum4[nt][0][j] += __bfloat162float(biases[nt][j]);
C[adr] = __float2bfloat16(sum4[nt][0][j]);
} else {
if (BIAS) sum4[nt][0][j] += __half2float(biases[nt][j]);
C[adr] = __float2half(sum4[nt][0][j]);
}
int g_nindx =
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx + M * g_nindx * 4;
vals[nt][j] = glbl[g_adr];
}
}
}
} else {
if (m + (threadIdx.x % 16) < M) {
int my_cntr;
if (!BIAS) {
int mindx = m + (threadIdx.x % 16);
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
__builtin_amdgcn_sched_barrier(0);
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
if (nindx < actlN) {
int adr = mindx + M * nindx;
atomicAdd(&glbl[adr], sum4[nt][0][j]);
}
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr_ = mindx + M * nindx_ / 4;
my_cntr = atomicAdd(&cntr[adr_], 1);
float vals[N / NTILE / GrpsShrB][4] = {};
if (my_cntr + 1 == k_rnd) {
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr = mindx + M * nindx;
vals[nt][j] = glbl[adr];
}
}
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
if (nindx >= actlN) break;
int adr = mindx + M * nindx;
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
C[adr] = __float2bfloat16(vals[nt][j]);
} else {
C[adr] = __float2half(vals[nt][j]);
}
}
}
}
} else {
int mindx = m + (threadIdx.x % 16);
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
// Atomic add the output, read biases
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr = mindx + M * nindx;
atomicAdd(&glbl[adr], sum4[nt][0][j]);
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * M];
}
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr_ = mindx + M * nindx_ / 4;
// Update the complete counter
my_cntr = atomicAdd(&cntr[adr_], 1);
float vals[N / NTILE / GrpsShrB][4] = {};
// If we're the last k-shard, read back the value and convert...
if (my_cntr + 1 == k_rnd) {
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr = mindx + M * nindx;
vals[nt][j] = glbl[adr];
}
}
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
if (nindx >= actlN) break;
int adr = mindx + M * nindx;
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
vals[nt][j] += __bfloat162float(biases[nt][j]);
C[adr] = __float2bfloat16(vals[nt][j]);
} else {
vals[nt][j] += __half2float(biases[nt][j]);
C[adr] = __float2half(vals[nt][j]);
}
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
vals[nt][j] += __bfloat162float(biases[nt][j]);
C[adr] = __float2bfloat16(vals[nt][j]);
} else {
vals[nt][j] += __half2float(biases[nt][j]);
C[adr] = __float2half(vals[nt][j]);
}
}
}
@@ -1814,7 +1751,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N, int GrpsShrB>
int UNRL, int N, int GrpsShrB, int CHUNKK>
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
const int Bx, const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
@@ -1859,10 +1796,10 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// const int max_lds_len = get_lds_size() / 2;
#define WVSPLITKrc(_WvPrGrp, _YTILE, _UNRL, _N, _GrpsShrB) \
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
{ \
dim3 block(64, _WvPrGrp); \
wvSplitKrc_<fptype, 64, _YTILE, _WvPrGrp, 8, _UNRL, _N, _GrpsShrB> \
dim3 block(64, 4); \
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK> \
<<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, glbl, c, CuCount); \
}
@@ -1877,15 +1814,37 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
auto glbl = axl_glbl.data_ptr<float>();
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
// and each working on a 512-shard of K, how many CUs would we need?
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
// How many of the 4 waves in a group can work on the same 16 Ms at the same
// time? First, try to maximize this. This reduces the Ms each group works on,
// i.e. it increases the number of CUs needed.
int GrpsShrB = min(N_p2 / 16, 4);
// Given the above, how many CUs would we need?
int CuNeeded = rndup_cus * GrpsShrB;
if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
// Can we increase SplitK by shrinking the K-shard to 256?
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
switch (N_p2) {
case 16:
WVSPLITKrc(4, 16, 1, 16, 1) break;
WVSPLITKrc(16, 1, 1) break;
case 32:
WVSPLITKrc(4, 16, 1, 32, 2) break;
if (chunkk == 2)
WVSPLITKrc(32, 2, 2) else if (chunkk == 1) WVSPLITKrc(32, 2, 1) break;
case 64:
WVSPLITKrc(4, 16, 1, 64, 2) break;
if (chunkk == 2)
WVSPLITKrc(64, 4, 2) else if (chunkk == 1) WVSPLITKrc(64, 4, 1) break;
case 128:
WVSPLITKrc(4, 16, 1, 128, 4) break;
if (chunkk == 2)
WVSPLITKrc(128, 4, 2) else if (chunkk == 1)
WVSPLITKrc(128, 4, 1) break;
default:
throw std::runtime_error(
"Unsupported N value: " + std::to_string(M_in) + "," +
@@ -1899,8 +1858,9 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
@@ -1924,9 +1884,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
__shared__ fp8_t s[max_lds_len];
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
#if defined(__gfx950__)
__builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
#else
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
#endif
}
asm volatile("s_waitcnt vmcnt(0)");
__syncthreads();
if (threadIdx.y >= _WvPrGrp) return;
@@ -1934,37 +1899,24 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
floatx16 sum[N][YTILE];
float sA = *s_A;
float sB = *s_B;
while (m < M) {
for (int i = 0; i < YTILE; i++)
for (int n = 0; n < N; n++) sum[n][i] = {0.f};
bigType bigA[N][UNRL];
bigType bigB[YTILE][UNRL];
floatx16 sum[N][YTILE] = {};
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
#pragma unroll
for (uint32_t n = 0; n < N; ++n) bigA[n][k2].h8 = {0.f};
#pragma unroll
for (uint32_t y = 0; y < YTILE; ++y) bigB[y][k2].h8 = {0.f};
}
bigType bigA[N][UNRL] = {};
bigType bigB[YTILE][UNRL];
// Fetch the weight matrix from memory!
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break;
const fp8_t* B_ = &B[(m + 0) * Kp + k_];
const fp8_t* B_ = &B[min__(k_, K - A_CHUNK)];
#pragma unroll
for (uint32_t y = 0; y < YTILE; ++y) {
bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp])));
bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
}
}
@@ -1975,16 +1927,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break;
for (int n = 0; n < N; n++) {
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
}
}
// Do the matrix multiplication in interleaved manner
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
if (k >= K) break;
for (uint32_t n = 0; n < N; n++) {
for (int i = 0; i < A_CHUNK; i += 8) {
for (int y = 0; y < YTILE; ++y) {
@@ -2002,48 +1951,27 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int y = 0; y < YTILE; y++) {
float accm0 = sum[n][y][0];
float accm16 = sum[n][y][8];
asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][1]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][9]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][2]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][10]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][3]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][11]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][4]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][12]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][5]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][13]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][6]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][14]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][7]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][15]), "v"(accm16));
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][1], 0x101, 0xf, 0xf,
1); // row_shl1
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][9], 0x101, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][2], 0x102, 0xf, 0xf,
1); // row_shl2
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][10], 0x102, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][3], 0x103, 0xf, 0xf,
1); // row_shl3
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][11], 0x103, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][4], 0x108, 0xf, 0xf,
1); // row_shl8
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][12], 0x108, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][5], 0x109, 0xf, 0xf,
1); // row_shl9
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][13], 0x109, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][6], 0x10a, 0xf, 0xf,
1); // row_shl10
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][14], 0x10a, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][7], 0x10b, 0xf, 0xf,
1); // row_shl11
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][15], 0x10b, 0xf, 0xf, 1);
accm0 += __shfl(accm0, 36);
accm16 += __shfl(accm16, 52);
sum[n][y][0] = accm0 + __shfl(accm16, 16);
@@ -2051,19 +1979,23 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
if (threadIdx.x == 0) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
}
}
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
sum[n][y][0] += __half2float(biases[n][y]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
sum[n][y][0] += __bfloat162float(biases[n][y]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]); // * sA * sB);
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
}
}
}
@@ -2074,9 +2006,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
const int M, const int Bx, const int By,
const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS,
scalar_t* C, const float* __restrict__ s_A,
const float* __restrict__ s_B,
@@ -2089,8 +2021,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
wvSplitKQ_hf_(const int K, const int Kap, const int Kbp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A, const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) {
@@ -2113,9 +2046,14 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
__shared__ fp8_t s[max_lds_len];
for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK;
k < min__(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
k < min__(Kap * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) {
#if defined(__gfx950__)
__builtin_amdgcn_global_load_lds((int*)(&A[k]), (int*)(&s[k]), 16, 0, 0);
#else
*((bigType*)(&s[k])) = *((bigType*)(&A[k]));
#endif
}
asm volatile("s_waitcnt vmcnt(0)");
__syncthreads();
if (threadIdx.y >= _WvPrGrp) return;
@@ -2123,29 +2061,23 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
floatx16 sum[N][YTILE];
float sA = *s_A;
float sB = *s_B;
while (m < M) {
for (int i = 0; i < YTILE; i++)
for (int n = 0; n < N; n++) sum[n][i] = {0};
bigType bigA[N][UNRL];
bigType bigB[YTILE][UNRL];
floatx16 sum[N][YTILE] = {};
for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
bigType bigA[N][UNRL] = {};
bigType bigB[YTILE][UNRL];
// Fetch the weight matrix from memory!
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break;
const fp8_t* B_ = &B[(m + 0) * Kp + k_];
const fp8_t* B_ = &B[min__(k_, K - A_CHUNK)];
for (int y = 0; y < YTILE; ++y) {
if (y + m >= M) break; // To avoid mem access fault.
bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp])));
bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[min__(y + m, M - 1) * Kbp])));
}
}
@@ -2156,20 +2088,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break;
for (int n = 0; n < N; n++) {
if (k_ + K * n < max_lds_len)
bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n])));
if (k_ + Kap * n < max_lds_len)
bigA[n][k2] = *((const bigType*)(&(s[k_ + Kap * n])));
else
bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n])));
bigA[n][k2] = *((const bigType*)(&(A[k_ + Kap * n])));
}
}
// Do the matrix multiplication in interleaved manner
#pragma unroll
for (uint32_t k2 = 0; k2 < UNRL; k2++) {
uint32_t k = k1 + k2 * THRDS * A_CHUNK;
uint32_t k_ = k + threadIdx.x * A_CHUNK;
if (k_ >= K) break;
for (uint32_t n = 0; n < N; n++) {
for (int i = 0; i < A_CHUNK; i += 8) {
for (int y = 0; y < YTILE; ++y) {
@@ -2187,48 +2115,27 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int y = 0; y < YTILE; y++) {
float accm0 = sum[n][y][0];
float accm16 = sum[n][y][8];
asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][1]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][9]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][2]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][10]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][3]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][11]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][4]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][12]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][5]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][13]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][6]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][14]), "v"(accm16));
asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
: "=v"(accm0)
: "0"(accm0), "v"(sum[n][y][7]), "v"(accm0));
asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 "
: "=v"(accm16)
: "0"(accm16), "v"(sum[n][y][15]), "v"(accm16));
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][1], 0x101, 0xf, 0xf,
1); // row_shl1
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][9], 0x101, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][2], 0x102, 0xf, 0xf,
1); // row_shl2
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][10], 0x102, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][3], 0x103, 0xf, 0xf,
1); // row_shl3
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][11], 0x103, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][4], 0x108, 0xf, 0xf,
1); // row_shl8
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][12], 0x108, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][5], 0x109, 0xf, 0xf,
1); // row_shl9
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][13], 0x109, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][6], 0x10a, 0xf, 0xf,
1); // row_shl10
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][14], 0x10a, 0xf, 0xf, 1);
accm0 += __builtin_amdgcn_mov_dpp(sum[n][y][7], 0x10b, 0xf, 0xf,
1); // row_shl11
accm16 += __builtin_amdgcn_mov_dpp(sum[n][y][15], 0x10b, 0xf, 0xf, 1);
accm0 += __shfl(accm0, 36);
accm16 += __shfl(accm16, 52);
sum[n][y][0] = accm0 + __shfl(accm16, 16);
@@ -2236,17 +2143,21 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
if (threadIdx.x == 0) {
scalar_t biases[N][YTILE] = {};
if (BIAS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
biases[n][y] = BIAS[(m + y) % Bx + (n % By) * Bx];
}
}
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
sum[n][y][0] += __half2float(biases[n][y]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
sum[n][y][0] += __bfloat162float(biases[n][y]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
}
@@ -2259,9 +2170,9 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
__global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
const int M, const int Bx, const int By,
const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
@@ -2270,17 +2181,18 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
}
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
const at::Tensor& scale_a, const at::Tensor& scale_b,
const int64_t CuCount) {
static c10::ScalarType kFp8Type = is_fp8_ocp()
? c10::ScalarType::Float8_e4m3fn
: c10::ScalarType::Float8_e4m3fnuz;
auto M_in = in_a.size(0);
auto K_in = in_a.size(1);
auto N_in = in_b.size(0);
auto Kp_in = in_a.stride(0);
auto M_in = in_b.size(0);
auto K_in = in_b.size(1);
auto N_in = in_a.size(0);
auto Kap_in = in_a.stride(0);
auto Kbp_in = in_b.stride(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
@@ -2300,23 +2212,22 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size();
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
} \
#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \
By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
@@ -2332,16 +2243,16 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
: nullptr;
switch (N_in) {
case 1:
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)
WVSPLITKQ(12, 2, 2, 2, 2, 1)
break;
case 2:
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 2)
WVSPLITKQ(12, 2, 2, 2, 2, 2)
break;
case 3:
WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 3)
WVSPLITKQ(8, 2, 2, 1, 1, 3)
break;
case 4:
WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 4)
WVSPLITKQ(4, 2, 2, 1, 1, 4)
break;
default:
throw std::runtime_error(

View File

@@ -320,7 +320,7 @@ WORKDIR /workspace
# Build DeepGEMM wheel
# Default moved here from tools/install_deepgemm.sh for centralized version management
ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6
ARG DEEPGEMM_GIT_REF=477618cd51baffca09c4b0b87e97c03fe827ef03
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/deepgemm/dist && \
@@ -582,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This is ~1.1GB and only changes when FlashInfer version bumps
# https://docs.flashinfer.ai/installation.html
# From versions.json: .flashinfer.version
ARG FLASHINFER_VERSION=0.6.2
ARG FLASHINFER_VERSION=0.6.3
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \

View File

@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
# Copy test requirements
COPY requirements/test.in requirements/cpu-test.in
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
remove_packages_not_supported_on_aarch64() { \

View File

@@ -217,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins
# release version: v0.6.2
# release version: v0.6.3
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo "git clone flashinfer..." \
&& git clone --depth 1 --branch v0.6.2 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& git clone --depth 1 --branch v0.6.3 --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \

View File

@@ -1,5 +1,10 @@
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
WORKDIR /workspace/
ARG PYTHON_VERSION=3.12
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
@@ -22,13 +27,16 @@ RUN apt clean && apt-get update -y && \
python3.12-dev \
python3-pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt update && apt upgrade -y && \
apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
ENV PATH="/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.8_offline.sh"
@@ -44,20 +52,31 @@ SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
WORKDIR /workspace/vllm
COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
# suppress the python externally managed environment error
RUN python3 -m pip config set global.break-system-packages true
ENV UV_HTTP_TIMEOUT=500
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
-r requirements/xpu.txt
# Configure package index for XPU
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE="copy"
# arctic-inference is built from source which needs torch-xpu properly installed
# used for suffix method speculative decoding
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir arctic-inference==0.1.1
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/common.txt,target=/workspace/vllm/requirements/common.txt \
--mount=type=bind,src=requirements/xpu.txt,target=/workspace/vllm/requirements/xpu.txt \
uv pip install --upgrade pip && \
uv pip install -r requirements/xpu.txt
# used for suffix method speculative decoding
# build deps for proto + nanobind-based extensions to set up the build environment
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install grpcio-tools protobuf nanobind
# arctic-inference is built from source which needs torch-xpu properly installed first
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/intel/oneapi/setvars.sh --force && \
source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force && \
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
uv pip install --no-build-isolation arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
@@ -69,33 +88,32 @@ RUN --mount=type=bind,source=.git,target=.git \
ENV VLLM_TARGET_DEVICE=xpu
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
pip install --no-build-isolation .
uv pip install --no-build-isolation .
CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
RUN uv pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# FIX triton
RUN --mount=type=cache,target=/root/.cache/pip pip uninstall triton triton-xpu -y && pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/xpu
# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip uninstall triton triton-xpu && \
uv pip install triton-xpu==3.6.0
# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip uninstall oneccl oneccl-devel
ENTRYPOINT ["vllm", "serve"]

View File

@@ -50,7 +50,7 @@
"default": "cuda"
},
"DEEPGEMM_GIT_REF": {
"default": "594953acce41793ae00a1233eb516044d604bcb6"
"default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
},
"PPLX_COMMIT_HASH": {
"default": "12cecfd"
@@ -68,7 +68,7 @@
"default": "true"
},
"FLASHINFER_VERSION": {
"default": "0.6.2"
"default": "0.6.3"
},
"GDRCOPY_CUDA_VERSION": {
"default": "12.8"

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.7 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.8 MiB

View File

@@ -291,6 +291,52 @@ Based on the configuration, the content of the multi-modal caches on `P0` and `P
K: Stores the hashes of multi-modal items
V: Stores the processed tensor data of multi-modal items
## CPU Resources for GPU Deployments
vLLM V1 uses a multi-process architecture (see [V1 Process Architecture](../design/arch_overview.md#v1-process-architecture)) where each process requires CPU resources. Underprovisioning CPU cores is a common source of performance degradation, especially in virtualized environments.
### Minimum CPU Requirements
For a deployment with `N` GPUs, there are at minimum:
- **1 API server process** -- handles HTTP requests, tokenization, and input processing
- **1 engine core process** -- runs the scheduler and coordinates GPU workers
- **N GPU worker processes** -- one per GPU, executes model forward passes
This means there are always at least **`2 + N` processes** competing for CPU time.
!!! warning
Using fewer physical CPU cores than processes will cause contention and significantly degrade throughput and latency. The engine core process runs a busy loop and is particularly sensitive to CPU starvation.
The minimum is `2 + N` physical cores (1 for the API server, 1 for the engine core, and 1 per GPU worker). In practice, allocating more cores improves performance because the OS, PyTorch background threads, and other system processes also need CPU time.
!!! important
Please note we are referring to **physical CPU cores** here. If your system has hyperthreading enabled, then 1 vCPU = 1 hyperthread = 1/2 physical CPU core, so you need a minimum of `2 x (2 + N)` vCPUs.
### Data Parallel and Multi-API Server Deployments
When using data parallelism or multiple API servers, the CPU requirements increase:
```console
Minimum physical cores = A + DP + N + (1 if DP > 1 else 0)
```
where `A` is the API server count (defaults to `DP`), `DP` is the data parallel size, and `N` is the total number of GPUs. For example, with `DP=4, TP=2` on 8 GPUs:
```console
4 API servers + 4 engine cores + 8 GPU workers + 1 DP coordinator = 17 processes
```
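The arithmetic above is simple enough to sanity-check directly. The helper below is only an illustrative sketch of the formula (it is not part of vLLM), using the two deployments from this page as examples:

```python
def min_physical_cores(num_gpus: int, dp: int = 1, api_servers: int | None = None) -> int:
    """Minimum physical cores = A + DP + N (+ 1 DP coordinator if DP > 1)."""
    a = dp if api_servers is None else api_servers  # API server count defaults to DP
    dp_coordinator = 1 if dp > 1 else 0
    return a + dp + num_gpus + dp_coordinator

print(min_physical_cores(num_gpus=4))        # tp=4, dp=1 -> 1 + 1 + 4     = 6
print(min_physical_cores(num_gpus=8, dp=4))  # tp=2, dp=4 -> 4 + 4 + 8 + 1 = 17
# With hyperthreading enabled, double the result to get the minimum vCPU count.
```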
### Performance Impact
CPU underprovisioning particularly impacts:
- **Input processing throughput** -- tokenization, chat template rendering, and multi-modal data loading all run on CPU
- **Scheduling latency** -- the engine core scheduler runs on CPU and directly affects how quickly new tokens are dispatched to the GPU workers
- **Output processing** -- detokenization, networking, and especially streaming token responses use CPU cycles
If you observe that GPU utilization is lower than expected, CPU contention may be the bottleneck. Increasing the number of available CPU cores and even the clock speed can significantly improve end-to-end performance.
## Attention Backend Selection
vLLM supports multiple attention backends optimized for different hardware and use cases. The backend is automatically selected based on your GPU architecture, model type, and configuration, but you can also manually specify one for optimal performance.
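For example, the selection can usually be pinned through the `VLLM_ATTENTION_BACKEND` environment variable before the engine is created. Backend names vary across vLLM versions and hardware, so treat the value below as an assumption and check the backend list for your installation:

```python
import os

# Assumed backend name; must be set before the engine is constructed.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

llm = LLM(model="Qwen/Qwen3-0.6B")
```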

View File

@@ -138,7 +138,7 @@ These models should follow the same instructions as case (1), but they should in
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
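To make the required surface area concrete, here is a rough skeleton of a custom mamba-like layer. It only uses the method names listed above; the signatures, return shapes, and hyperparameters are assumptions for illustration, so consult the referenced implementations for the real interface:

```python
import torch

class MyShortConvLayer:  # in a real implementation this would inherit from MambaBase
    @property
    def mamba_type(self) -> str:
        # Identifies which mamba-like backend family this layer belongs to (assumed value).
        return "short_conv"

    def get_attn_backend(self):
        # Would return the attention backend class paired with the layer's metadata class.
        raise NotImplementedError

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        # Dtypes of the in-place-updated state tensors.
        return (torch.bfloat16,)

    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
        # Per-request shapes of the state tensors, computed from (hypothetical) layer hyperparameters.
        conv_width, hidden_size = 4, 2048
        return ((conv_width - 1, hidden_size),)
```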

View File

@@ -739,7 +739,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
```
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the promopt:
a BOS token (`<s>`) is also added to the prompt:
??? code

View File

@@ -1,161 +1,13 @@
---
toc_depth: 2
---
# Using Docker
## Use vLLM's Official Docker Image
## Pre-built images
vLLM offers an official Docker image for deployment.
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
--8<-- "docs/getting_started/installation/gpu.md:pre-built-images"
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B
```
## Build image from source
This image can also be used with other container engines such as [Podman](https://podman.io/).
```bash
podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
docker.io/vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B
```
You can add any other [engine-args](../configuration/engine_args.md) you need after the image tag (`vllm/vllm-openai:latest`).
!!! note
You can either use the `ipc=host` flag or `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference.
!!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.11.0
# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.11.0
```
!!! tip
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```
## Building vLLM's Docker Image from Source
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \
--tag vllm/vllm-openai \
--file docker/Dockerfile
```
!!! note
By default vLLM will build for all GPU types for widest distribution. If you are just building for the
current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that.
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
!!! note
If you have not changed any C++ or CUDA kernel code, you can use precompiled wheels to significantly reduce Docker build time.
* **Enable the feature** by adding the build argument: `--build-arg VLLM_USE_PRECOMPILED="1"`.
* **How it works**: By default, vLLM automatically finds the correct wheels from our [Nightly Builds](../contributing/ci/nightly_builds.md) by using the merge-base commit with the upstream `main` branch.
* **Override commit**: To use wheels from a specific commit, provide the `--build-arg VLLM_PRECOMPILED_WHEEL_COMMIT=<commit_hash>` argument.
For a detailed explanation, refer to the documentation on 'Set up using Python-only build (without compilation)' part in [Build wheel from source](../contributing/ci/nightly_builds.md#precompiled-wheels-usage), these args are similar.
## Building for Arm64/aarch64
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
!!! note
Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
??? console "Command"
```bash
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
--build-arg RUN_WHEEL_CHECK=false
```
For (G)B300, we recommend using CUDA 13, as shown in the following command.
??? console "Command"
```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg CUDA_VERSION=13.0.1 \
--build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
--build-arg max_jobs=256 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
--platform "linux/arm64" \
--tag vllm/vllm-gb300-openai:latest \
--target vllm-openai \
-f docker/Dockerfile \
.
```
!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
Run the following command on your host machine to register QEMU user static handlers:
```bash
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
```
After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
## Use the custom-built vLLM Docker image
To run vLLM with the custom-built Docker image:
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN=<secret>" \
vllm/vllm-openai <args...>
```
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
!!! note
**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
--8<-- "docs/getting_started/installation/gpu.md:build-image-from-source"

View File

@@ -78,6 +78,73 @@ That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/ent
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
## V1 Process Architecture
vLLM V1 uses a multi-process architecture to separate concerns and maximize throughput. Understanding this architecture is important for properly sizing CPU resources in your deployment. The key processes are:
### API Server Process
The API server process handles HTTP requests (e.g., the OpenAI-compatible API), performs input processing (tokenization, multi-modal data loading), and streams results back to clients. It communicates with the engine core process(es) via ZMQ sockets.
By default, there is **1 API server process**, but when data parallelism is used, the API server count automatically scales to match the data parallel size. This can also be manually configured with the `--api-server-count` flag. Each API server connects to **all** engine cores via ZMQ in a many-to-many topology, enabling any API server to route requests to any engine core. Each API server process uses multiple CPU threads for media loading (controlled by `VLLM_MEDIA_LOADING_THREAD_COUNT`, default 8).
The code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py) and [vllm/v1/utils.py](../../vllm/v1/utils.py).
### Engine Core Process
The engine core process runs the scheduler, manages KV cache, and coordinates model execution across GPU workers. It runs a busy loop that continuously schedules requests and dispatches work to the GPU workers.
There is **1 engine core process per data parallel rank**. For example, with `--data-parallel-size 4`, there are 4 engine core processes.
The code can be found in [vllm/v1/engine/core.py](../../vllm/v1/engine/core.py) and [vllm/v1/engine/utils.py](../../vllm/v1/engine/utils.py).
### GPU Worker Processes
Each GPU is managed by a dedicated worker process. The worker process loads model weights, executes forward passes, and manages GPU memory. Workers communicate with the engine core process that owns them.
There is **1 worker process per GPU**. The total number of GPU worker processes equals `tensor_parallel_size x pipeline_parallel_size` per engine core.
The code can be found in [vllm/v1/executor/multiproc_executor.py](../../vllm/v1/executor/multiproc_executor.py) and [vllm/v1/worker/gpu_worker.py](../../vllm/v1/worker/gpu_worker.py).
### DP Coordinator Process (conditional)
When using data parallelism (`--data-parallel-size > 1`), an additional coordinator process manages load balancing across DP ranks and coordinates synchronized forward passes for MoE models.
There is **1 DP coordinator process** (only when data parallelism is enabled).
The code can be found in [vllm/v1/engine/coordinator.py](../../vllm/v1/engine/coordinator.py).
### Process Count Summary
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
| Process Type | Count | Notes |
|---|---|---|
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
| GPU Worker | `N` (= `DP x TP`) | One per GPU, executes model forward passes |
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
For example, a typical single-node deployment with 4 GPUs (`vllm serve -tp=4`) has:
- 1 API server + 1 engine core + 4 GPU workers = **6 processes**
<figure markdown="1">
![V1 Process Architecture - TP=4](../assets/design/arch_overview/v1_process_architecture_tp4.png)
</figure>
A data parallel deployment with 8 GPUs (`vllm serve -tp=2 -dp=4`) has:
- 4 API servers + 4 engine cores + 8 GPU workers + 1 DP coordinator = **17 processes**
<figure markdown="1">
![V1 Process Architecture - TP=2, DP=4](../assets/design/arch_overview/v1_process_architecture_tp2_dp4.png)
</figure>
For CPU resource sizing recommendations, see
[CPU Resources for GPU Deployments](../configuration/optimization.md#cpu-resources-for-gpu-deployments).
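As a quick cross-check of the table and the two deployments above, the per-type counts can be tallied with a short sketch (illustrative only, not vLLM code):

```python
def v1_process_breakdown(tp: int, dp: int = 1, api_servers: int | None = None) -> dict[str, int]:
    """Per-type process counts for a V1 deployment, following the table above."""
    a = dp if api_servers is None else api_servers  # A defaults to DP
    return {
        "api_server": a,
        "engine_core": dp,
        "gpu_worker": tp * dp,  # N = TP x DP
        "dp_coordinator": 1 if dp > 1 else 0,
    }

print(sum(v1_process_breakdown(tp=4).values()))        # vllm serve -tp=4        -> 6
print(sum(v1_process_breakdown(tp=2, dp=4).values()))  # vllm serve -tp=2 -dp=4  -> 17
```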
## LLM Engine
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of

View File

@@ -24,7 +24,7 @@ vLLM's plugin system uses the standard Python `entry_points` mechanism. This mec
["register_dummy_model = vllm_add_dummy_model:register"]
})
# inside `vllm_add_dummy_model.py` file
# inside `vllm_add_dummy_model/__init__.py` file
def register():
from vllm import ModelRegistry
@@ -45,7 +45,7 @@ Every plugin has three parts:
## Types of supported plugins
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function. For an example of an official model plugin, see the [bart-plugin](https://github.com/vllm-project/bart-plugin) which adds support for `BartForConditionalGeneration`.
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
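To make the platform-plugin contract concrete, here is a minimal sketch of such an entry-point function. The package, module, and class names are hypothetical; only the return convention (fully qualified class name, or `None` when unsupported) comes from the description above:

```python
# inside a hypothetical `vllm_add_dummy_platform/__init__.py` file
def register() -> str | None:
    try:
        import dummy_accelerator_sdk  # hypothetical runtime probe for the target hardware
    except ImportError:
        return None  # platform not supported in the current environment
    return "vllm_add_dummy_platform.platform.DummyPlatform"
```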

View File

@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside [examples](../../examples).
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@@ -213,6 +213,15 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
```
### Cross-layer blocks
By default, this feature is disabled. On attention backends that support it, enabling this feature makes each logical block contiguous in physical memory, which reduces the number of buffers that need to be transferred.
To enable this feature:
```bash
--kv-transfer-config '{..., "kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'
```
## Example Scripts/Code
Refer to these example scripts in the vLLM repository:

View File

@@ -17,6 +17,7 @@ following `quantization.quant_algo` values:
- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
- `MXFP8`: ModelOpt MXFP8 checkpoints (use `quantization="modelopt_mxfp8"`).
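At load time, these map onto vLLM's `quantization` argument as described above. A minimal sketch, assuming a hypothetical ModelOpt NVFP4 checkpoint:

```python
from vllm import LLM

# Hypothetical checkpoint name; pick the quantization value matching the checkpoint's quant_algo.
llm = LLM(model="my-org/Llama-3.1-8B-Instruct-ModelOpt-NVFP4", quantization="modelopt_fp4")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```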
## Quantizing HuggingFace Models with PTQ

View File

@@ -239,27 +239,168 @@ uv pip install -e .
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
Another way to access the latest code is to use the docker images:
vLLM offers an official Docker image for deployment.
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
```bash
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B
```
These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
This image can also be used with other container engines such as [Podman](https://podman.io/).
The latest code can contain bugs and may not be stable. Please use it with caution.
```bash
podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
docker.io/vllm/vllm-openai:latest \
--model Qwen/Qwen3-0.6B
```
You can add any other [engine-args](https://docs.vllm.ai/en/latest/configuration/engine_args/) you need after the image tag (`vllm/vllm-openai:latest`).
!!! note
You can either use the `ipc=host` flag or `--shm-size` flag to allow the
container to access the host's shared memory. vLLM uses PyTorch, which uses shared
memory to share data between processes under the hood, particularly for tensor parallel inference.
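For example, here is a sketch of the same server launch using `--shm-size` instead of `--ipc=host` (the 16 GB value is an assumption; size it according to your model and tensor-parallel degree):
```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=$HF_TOKEN" \
    -p 8000:8000 \
    --shm-size=16g \
    vllm/vllm-openai:latest \
    --model Qwen/Qwen3-0.6B
```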
!!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.11.0
# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.11.0
```
!!! tip
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
You can build and run vLLM from source via the provided [docker/Dockerfile](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile). To build vLLM:
```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \
--tag vllm/vllm-openai \
--file docker/Dockerfile
```
!!! note
By default vLLM will build for all GPU types for widest distribution. If you are just building for the
current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
for vLLM to find the current GPU type and build for that.
If you are using Podman instead of Docker, you might need to disable SELinux labeling by
adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
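For example, a minimal sketch of the equivalent Podman build, mirroring the Docker command above:
```bash
podman build . \
    --target vllm-openai \
    --tag vllm/vllm-openai \
    --file docker/Dockerfile \
    --security-opt label=disable
```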
!!! note
If you have not changed any C++ or CUDA kernel code, you can use precompiled wheels to significantly reduce Docker build time.
* **Enable the feature** by adding the build argument: `--build-arg VLLM_USE_PRECOMPILED="1"`.
* **How it works**: By default, vLLM automatically finds the correct wheels from our [Nightly Builds](https://docs.vllm.ai/en/latest/contributing/ci/nightly_builds/) by using the merge-base commit with the upstream `main` branch.
* **Override commit**: To use wheels from a specific commit, provide the `--build-arg VLLM_PRECOMPILED_WHEEL_COMMIT=<commit_hash>` argument.
For a detailed explanation, refer to the 'Set up using Python-only build (without compilation)' section of [Build wheel from source](https://docs.vllm.ai/en/latest/contributing/ci/nightly_builds/#precompiled-wheels-usage); these arguments work in a similar way.
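For example, a sketch that combines the build command above with the precompiled-wheel flag:
```bash
# Reuse precompiled kernels when no C++/CUDA code has changed.
DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
    --tag vllm/vllm-openai \
    --file docker/Dockerfile \
    --build-arg VLLM_USE_PRECOMPILED="1"
```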
#### Building vLLM's Docker Image from Source for Arm64/aarch64
A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper and Grace-Blackwell. Using the flag `--platform "linux/arm64"` will build for arm64.
!!! note
Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` and `--build-arg nvcc_threads=` flags to speed up the build. However, ensure that `max_jobs` is substantially larger than `nvcc_threads` to get the most benefit, and keep an eye on memory usage, as parallel jobs can consume a substantial amount (see the example below).
??? console "Command"
```bash
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t vllm/vllm-gh200-openai:latest \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
--build-arg RUN_WHEEL_CHECK=false
```
For (G)B300, we recommend using CUDA 13, as shown in the following command.
??? console "Command"
```bash
DOCKER_BUILDKIT=1 docker build \
--build-arg CUDA_VERSION=13.0.1 \
--build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 \
--build-arg max_jobs=256 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list='9.0 10.0+PTX' \
--platform "linux/arm64" \
--tag vllm/vllm-gb300-openai:latest \
--target vllm-openai \
-f docker/Dockerfile \
.
```
!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
Run the following command on your host machine to register QEMU user static handlers:
```bash
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
```
After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
#### Use the custom-built vLLM Docker image
To run vLLM with the custom-built Docker image:
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN=<secret>" \
vllm/vllm-openai <args...>
```
The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
!!! note
**For versions 0.4.1 and 0.4.2 only** - the vLLM docker images for these versions are meant to be run as the root user, because a library under the root user's home directory (`/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`) must be loaded at runtime. If you run the container as a different user, you may need to change the permissions of the library (and all of its parent directories) so that your user can access it, and then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]
See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
# --8<-- [end:supported-features]

View File

@@ -1,3 +1,7 @@
---
toc_depth: 3
---
# GPU
vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
@@ -84,6 +88,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G
### Pre-built images
<!-- markdownlint-disable MD025 -->
# --8<-- [start:pre-built-images]
=== "NVIDIA CUDA"
--8<-- "docs/getting_started/installation/gpu.cuda.inc.md:pre-built-images"
@@ -96,7 +103,15 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:pre-built-images"
# --8<-- [end:pre-built-images]
<!-- markdownlint-enable MD025 -->
<!-- markdownlint-disable MD001 -->
### Build image from source
<!-- markdownlint-enable MD001 -->
<!-- markdownlint-disable MD025 -->
# --8<-- [start:build-image-from-source]
=== "NVIDIA CUDA"
@@ -110,6 +125,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G
--8<-- "docs/getting_started/installation/gpu.xpu.inc.md:build-image-from-source"
# --8<-- [end:build-image-from-source]
<!-- markdownlint-enable MD025 -->
## Supported features
=== "NVIDIA CUDA"

View File

@@ -174,67 +174,44 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
#### Use vLLM's Official Docker Image
vLLM offers an official Docker image for deployment.
The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai-rocm](https://hub.docker.com/r/vllm/vllm-openai-rocm/tags).
???+ console "Commands"
```bash
docker run --rm \
--group-add=video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai-rocm:latest \
--model Qwen/Qwen3-0.6B
```
To use the Docker image as a base for development, you can launch it in an interactive session by overriding the entrypoint.
???+ console "Commands"
```bash
docker run --rm -it \
--group-add=video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
--entrypoint bash \
vllm/vllm-openai-rocm:latest
```
#### Use AMD's Docker Images
The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
AMD also offers a nightly prebuilt docker image on [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all of its dependencies installed. The entrypoint of this docker image is `/bin/bash` (different from vLLM's Official Docker Image).
???+ console "Commands"
```bash
docker pull rocm/vllm-dev:nightly # to get the latest image
docker run -it --rm \
--network=host \
```bash
docker run --rm \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/your/models>:/app/models \
-e HF_HOME="/app/models" \
rocm/vllm-dev:nightly
```
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai-rocm:latest \
--model Qwen/Qwen3-0.6B
```
#### Use AMD's Docker Images
Prior to January 20th, 2026, when the official docker images become available on the [upstream vLLM docker hub](https://hub.docker.com/v2/repositories/vllm/vllm-openai-rocm/tags/), the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
AMD also offers a nightly prebuilt docker image on [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev), which has vLLM and all of its dependencies installed. The entrypoint of this docker image is `/bin/bash` (different from vLLM's Official Docker Image).
```bash
docker pull rocm/vllm-dev:nightly # to get the latest image
docker run -it --rm \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/your/models>:/app/models \
-e HF_HOME="/app/models" \
rocm/vllm-dev:nightly
```
!!! tip
Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
@@ -243,7 +220,7 @@ AMD also offers nightly prebuilt docker image from [Docker Hub](https://hub.dock
# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
Building the Docker image from source is the recommended way to use vLLM with ROCm.
You can build and run vLLM from source via the provided [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm).
??? info "(Optional) Build an image with ROCm software stack"
@@ -269,8 +246,6 @@ Building the Docker image from source is the recommended way to use vLLM with RO
-t rocm/vllm-dev:base .
```
#### Build an image with vLLM
First, build a docker image from [docker/Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.rocm) and launch a docker container from the image.
It is important to kick off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or set up BuildKit in the docker daemon configuration `/etc/docker/daemon.json` as follows and restart the daemon:
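A minimal sketch of that daemon configuration (assuming `daemon.json` does not exist yet; otherwise merge the key into your existing file):
```bash
# Enable BuildKit in the Docker daemon, then restart it.
cat <<'EOF' | sudo tee /etc/docker/daemon.json
{
    "features": {
        "buildkit": true
    }
}
EOF
sudo systemctl restart docker
```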
@@ -292,30 +267,46 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
To build vLLM on ROCm 7.0 for the MI200 and MI300 series, you can use the default (which builds a docker image with `vllm serve` as the entrypoint):
???+ console "Commands"
```bash
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
```
To run the above docker image `vllm-rocm`, use the below command:
```bash
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm/vllm-openai-rocm .
```
???+ console "Commands"
```bash
docker run -it \
--network=host \
To run vLLM with the custom-built Docker image:
```bash
docker run --rm \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
vllm-rocm \
--model Qwen/Qwen3-0.6B
```
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai-rocm <args...>
```
Here, `<path/to/model>` is the location where the model is stored, for example the weights for Llama 2 or Llama 3 models.
The argument `vllm/vllm-openai-rocm` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
To use the Docker image as a base for development, you can launch it in an interactive session by overriding the entrypoint.
???+ console "Commands"
```bash
docker run --rm -it \
--group-add=video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
--network=host \
--ipc=host \
--entrypoint bash \
vllm/vllm-openai-rocm
```
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]

View File

@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
# --8<-- [start:requirements]
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.1
- OneAPI requirements: oneAPI 2025.3
- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package that provides all the necessary vLLM custom kernels for running vLLM on the Intel GPU platform
- Python: 3.12
!!! warning
The provided IPEX whl is Python3.12 specific so this version is a MUST.
The provided vllm-xpu-kernels wheel is Python 3.12 specific, so this version is a MUST.
# --8<-- [end:requirements]
# --8<-- [start:set-up-using-python]
@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
- Second, install Python packages for vLLM XPU backend building:
```bash
@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
- Then, build and install vLLM XPU backend:
```bash
VLLM_TARGET_DEVICE=xpu python setup.py install
VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
```
# --8<-- [end:build-wheel-from-source]

View File

@@ -57,7 +57,7 @@ This guide will help you quickly get started with vLLM to perform:
It currently supports Python 3.12, ROCm 7.0 and `glibc >= 2.35`.
!!! note
Note that, previously, docker images were published using AMD's docker release pipeline and were located `rocm/vlm-dev`. This is being deprecated by using vLLM's docker release pipeline.
Note that, previously, docker images were published using AMD's docker release pipeline and were located at `rocm/vllm-dev`. This is being deprecated in favor of vLLM's docker release pipeline.
=== "Google TPU"

View File

@@ -0,0 +1,3 @@
// Reo.Dev documentation tracking
// https://docs.reo.dev/integrations/tracking-beacon/install-javascript-for-documentation
!function(){var e,t,n;e="d5c4337961ef0ac",t=function(){Reo.init({clientID:"d5c4337961ef0ac"})},(n=document.createElement("script")).src="https://static.reo.dev/"+e+"/reo.js",n.defer=!0,n.onload=t,document.head.appendChild(n)}();

View File

@@ -174,6 +174,16 @@ class MyConfig(PretrainedConfig):
- The `list` in the first element of the `tuple` contains the names of the input arguments
- The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
### Plugins
Some model architectures are supported via vLLM plugins. These plugins extend vLLM's capabilities through the [plugin system](../design/plugin_system.md).
| Architecture | Models | Plugin Repository |
|--------------|--------|-------------------|
| `BartForConditionalGeneration` | BART | [bart-plugin](https://github.com/vllm-project/bart-plugin) |
For other model architectures not natively supported, in particular for Encoder-Decoder models, we recommend following a similar pattern by implementing support through the plugin system.
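As a hedged sketch of what using such a plugin might look like (the install command and the served model are assumptions, not instructions taken from the plugin repository):
```bash
# Hypothetical: install the plugin straight from its repository,
# then serve a BART checkpoint through vLLM as usual.
uv pip install git+https://github.com/vllm-project/bart-plugin.git
vllm serve facebook/bart-large-cnn
```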
## Loading a Model
### Hugging Face Hub
@@ -214,13 +224,13 @@ If you prefer, you can use the Hugging Face CLI to [download a model](https://hu
```bash
# Download a model
huggingface-cli download HuggingFaceH4/zephyr-7b-beta
hf download HuggingFaceH4/zephyr-7b-beta
# Specify a custom cache directory
huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache
hf download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache
# Download a specific file from a model repo
huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
hf download HuggingFaceH4/zephyr-7b-beta eval_results.json
```
#### List the downloaded models
@@ -229,13 +239,13 @@ Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingf
```bash
# List cached models
huggingface-cli scan-cache
hf scan-cache
# Show detailed (verbose) output
huggingface-cli scan-cache -v
hf scan-cache -v
# Specify a custom cache directory
huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
hf scan-cache --dir ~/.cache/huggingface/hub
```
#### Delete a cached model
@@ -250,7 +260,7 @@ Use the Hugging Face CLI to interactively [delete downloaded model](https://hugg
# Please run `pip install huggingface_hub[cli]` to install them.
# Launch the interactive TUI to select models to delete
$ huggingface-cli delete-cache
$ hf delete-cache
? Select revisions to delete: 1 revisions selected counting for 438.9M.
○ None of the following (if selected, nothing will be deleted).
Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago)
@@ -287,7 +297,7 @@ export https_proxy=http://your.proxy.server:port
- Set the proxy for just the current command:
```shell
https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
https_proxy=http://your.proxy.server:port hf download <model_name>
# or use vllm cmd directly
https_proxy=http://your.proxy.server:port vllm serve <model_name>
@@ -461,7 +471,7 @@ th {
| `StableLMEpochForCausalLM` | StableLM Epoch | `stabilityai/stablelm-zephyr-3b`, etc. | | ✅︎ |
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
| `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. | | ✅︎ |
| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/Step-3.5-Flash`, etc. | | ✅︎ |
| `TeleChatForCausalLM` | TeleChat | `chuhac/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
@@ -509,6 +519,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
| `VoyageQwen3BidirectionalEmbedModel`<sup>C</sup> | Voyage Qwen3-based with bidirectional attention | `voyageai/voyage-4-nano`, etc. | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
@@ -663,7 +674,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |

View File

@@ -311,7 +311,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
```
!!! important
@@ -319,7 +319,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
@@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
```
!!! important
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
!!! important
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
@@ -532,7 +532,7 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params"
```
The following extra parameters are supported:
@@ -540,7 +540,7 @@ The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params"
```
### Translations API
@@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params"
```
### Realtime API
@@ -954,28 +954,34 @@ You can pass multi-modal inputs to scoring models by passing `content` including
```python
import requests
response = requests.post(
"http://localhost:8000/v1/score",
json={
"model": "jinaai/jina-reranker-m0",
"queries": "slm markdown",
"documents": {
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
],
},
"documents": [
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
],
},
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
]
},
],
},
)
response.raise_for_status()
@@ -1001,7 +1007,6 @@ The following Score API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1009,7 +1014,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
### Re-rank API
@@ -1092,7 +1096,6 @@ The following Re-rank API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1100,7 +1103,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
```
## Ray Serve LLM

View File

@@ -134,9 +134,12 @@ Please note that prefix caching is not yet supported for any of the above models
#### Encoder-Decoder Models
Whisper is supported. Other models requiring cross-attention between separate
encoder and decoder (e.g., `BartForConditionalGeneration`,
`MllamaForConditionalGeneration`) are no longer supported.
Whisper is supported natively. Other encoder-decoder models are supported via the plugin system:
- **BART**: `BartForConditionalGeneration` is supported via the official [bart-plugin](https://github.com/vllm-project/bart-plugin).
For other encoder-decoder models (e.g., `MllamaForConditionalGeneration`), we recommend
following a similar pattern by implementing support through the [plugin system](../design/plugin_system.md).
### Features

View File

@@ -0,0 +1,208 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning using vLLM and Ray,
with native weight syncing APIs at engine instance.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies one GPU for training, whereas a
2x tensor-parallel vLLM inference engine occupies two GPUs.
The example performs the following steps:
* Load the training model on one gpu (scheduled via ray)
* Initialize the inference model with dummy weights across
two gpus using vLLM's tensor parallelism and Ray placement groups.
* Generate gibberish from a list of prompts using the randomly initialized
inference engine.
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group.
* Generating from the list of prompts after weight sync should result
in sensible outputs.
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects that the GPUs are used only for vLLM
workloads; residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
"""
import os
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.config import WeightTransferConfig
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLWeightTransferEngine,
)
from vllm.utils.network_utils import get_ip, get_open_port
MODEL_NAME = "facebook/opt-125m"
# MODEL_NAME = "inference-optimization/Qwen3-0.6B-W4A16-G128"
class MyLLM(LLM):
"""Configure the vLLM worker for Ray placement group execution."""
def __init__(self, *args, **kwargs):
os.environ["VLLM_RAY_BUNDLE_INDICES"] = "0,1"
super().__init__(*args, **kwargs)
@ray.remote(num_gpus=1)
class TrainModel:
"""Ray actor that wraps the training model on a dedicated GPU."""
def __init__(self, model_name: str):
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
).to("cuda:0")
self.port = get_open_port()
self.master_address = get_ip()
def get_master_address_and_port(self):
return self.master_address, self.port
def get_weight_metadata(self):
"""Return weight names, dtypes, and shapes for weight transfer."""
names = []
dtype_names = []
shapes = []
for name, p in self.model.named_parameters():
names.append(name)
dtype_names.append(str(p.dtype).split(".")[-1])
shapes.append(list(p.shape))
return names, dtype_names, shapes
def init_weight_transfer_group(self, world_size):
"""Initialize the NCCL process group for weight transfer."""
self.model_update_group = NCCLWeightTransferEngine.trainer_init(
dict(
master_address=self.master_address,
master_port=self.port,
world_size=world_size,
),
)
def broadcast_weights(self, packed: bool = True):
"""Broadcast weights to the inference engine."""
NCCLWeightTransferEngine.trainer_send_weights(
iterator=self.model.named_parameters(),
group=self.model_update_group,
packed=packed,
)
# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
ray.init()
# Create a placement group that reserves GPUs 1 and 2 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/placement-groups.html
# Launch the training model actor. Ray's resource scheduler will allocate
# 1 GPU (via num_gpus=1 in the decorator), ensuring pg_inference gets different GPUs.
train_model = TrainModel.remote(MODEL_NAME)
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group=pg_inference,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=0,
)
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.
# Note: Weight transfer APIs (init_weight_transfer_engine, update_weights)
# are now native to vLLM workers.
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
model=MODEL_NAME,
enforce_eager=True,
tensor_parallel_size=2,
data_parallel_size=1,
distributed_executor_backend="ray",
weight_transfer_config=WeightTransferConfig(backend="nccl"),
load_format="dummy",
quantization="fp8",
)
# Generate text from the prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
# Generate text with the initial model. The output is expected to be nonsense
# because the weights are randomly initialized.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Set up the communication channel between the training process and the
# inference engine.
master_address, master_port = ray.get(train_model.get_master_address_and_port.remote())
world_size = ray.get(llm.get_world_size.remote()) + 1 # +1 for the trainer
inference_handle = llm.init_weight_transfer_engine.remote(
dict(
init_info=dict(
master_address=master_address,
master_port=master_port,
rank_offset=1,
world_size=world_size,
)
)
)
# Initialize weight transfer group on both the training actor and inference engine
train_handle = train_model.init_weight_transfer_group.remote(world_size)
ray.get([train_handle, inference_handle])
# Synchronize the updated weights to the inference engine using batched API.
# Collect all weight metadata from the training actor
names, dtype_names, shapes = ray.get(train_model.get_weight_metadata.remote())
# Issue update_weights call with NCCL-specific update info
# packed=True enables efficient batched tensor broadcasting
inference_handle = llm.update_weights.remote(
dict(
update_info=dict(
names=names,
dtype_names=dtype_names,
shapes=shapes,
packed=True,
)
)
)
# Broadcast all weights from trainer using the weight transfer API
train_handle = train_model.broadcast_weights.remote(packed=True)
ray.get([train_handle, inference_handle])
# Generate text with the updated model. The output is expected to be normal
# because the weights are updated.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

View File

@@ -0,0 +1,283 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates async reinforcement learning using vLLM and Ray,
with native weight syncing APIs at engine instance.
The script separates training and inference workloads onto distinct GPUs
so that Ray can manage process placement and inter-process communication.
A Hugging Face Transformer model occupies one GPU for training, whereas a
2x tensor-parallel vLLM inference engine occupies two GPUs.
The example performs the following steps:
* Load the training model on one gpu (scheduled via ray)
* Initialize the inference model with dummy weights across
two gpus using vLLM's tensor parallelism and Ray placement groups.
* Generate gibberish from a list of prompts using the randomly initialized
inference engine.
* Pause generation once generation completes for one sequence
* Update the weights of the training model and broadcast the updated weights
to the inference engine by using a Ray collective RPC group.
* Resume generation and print out the results
This example assumes a single-node cluster with three GPUs, but Ray
supports multi-node clusters. vLLM expects that the GPUs are used only for vLLM
workloads; residual GPU activity interferes with vLLM memory profiling and
causes unexpected behavior.
"""
import os
import uuid
from dataclasses import asdict
import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from transformers import AutoModelForCausalLM, AutoTokenizer
import vllm
from vllm import SamplingParams
from vllm.config import WeightTransferConfig
from vllm.distributed.weight_transfer.base import (
WeightTransferInitRequest,
WeightTransferUpdateRequest,
)
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLWeightTransferEngine,
NCCLWeightTransferInitInfo,
NCCLWeightTransferUpdateInfo,
)
from vllm.utils.network_utils import get_ip, get_open_port
from vllm.v1.executor import Executor
MODEL_NAME = "facebook/opt-125m"
class MyLLM(vllm.AsyncLLMEngine):
"""Configure the vLLM worker for Ray placement group execution."""
def __init__(self, **kwargs):
os.environ["VLLM_RAY_BUNDLE_INDICES"] = "0,1"
engine_args = vllm.AsyncEngineArgs(**kwargs)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
super().__init__(
vllm_config=vllm_config,
executor_class=executor_class,
log_requests=engine_args.enable_log_requests,
log_stats=not engine_args.disable_log_stats,
)
async def generate_with_retry(
self, prompt_token_ids: list[int], sampling_params: vllm.SamplingParams
) -> vllm.RequestOutput:
finish_reason = "abort"
while finish_reason == "abort":
async for request_output in self.generate(
{"prompt_token_ids": prompt_token_ids},
sampling_params,
request_id=str(uuid.uuid4()),
):
output = request_output
finish_reason = output.outputs[0].finish_reason
if finish_reason == "abort":
print(
f"ABORT, prompt_token_ids: {prompt_token_ids}, "
f"generated token_ids: {list(output.outputs[0].token_ids)}"
)
prompt_token_ids = prompt_token_ids + list(output.outputs[0].token_ids)
return output
@ray.remote(num_gpus=1)
class TrainModel:
"""Ray actor that wraps the training model on a dedicated GPU."""
def __init__(self, model_name: str):
self.model = AutoModelForCausalLM.from_pretrained(
model_name, dtype=torch.bfloat16
).to("cuda:0")
self.port = get_open_port()
self.master_address = get_ip()
def get_master_address_and_port(self):
return self.master_address, self.port
def get_weight_metadata(self):
"""Return weight names, dtypes, and shapes for weight transfer."""
names = []
dtype_names = []
shapes = []
for name, p in self.model.named_parameters():
names.append(name)
dtype_names.append(str(p.dtype).split(".")[-1])
shapes.append(list(p.shape))
return names, dtype_names, shapes
def init_weight_transfer_group(self, world_size):
"""Initialize the NCCL process group for weight transfer."""
self.model_update_group = NCCLWeightTransferEngine.trainer_init(
dict(
master_address=self.master_address,
master_port=self.port,
world_size=world_size,
),
)
def broadcast_weights(self, packed: bool = True):
"""Broadcast weights to the inference engine."""
NCCLWeightTransferEngine.trainer_send_weights(
iterator=self.model.named_parameters(),
group=self.model_update_group,
packed=packed,
)
# Initialize Ray and set the visible devices. The vLLM engine will
# be placed on GPUs 1 and 2.
ray.init()
# Launch the training model actor. Ray's resource scheduler will allocate
# 1 GPU (via num_gpus=1 in the decorator), ensuring pg_inference gets different GPUs.
train_model = TrainModel.remote(MODEL_NAME)
# Create a placement group that reserves GPUs 1 and 2 for the vLLM inference engine.
# Learn more about Ray placement groups:
# https://docs.ray.io/en/latest/placement-groups.html
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group=pg_inference,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=0,
)
# Launch the vLLM inference engine. The `enforce_eager` flag reduces
# start-up latency.
# Note: Weight transfer APIs (init_weight_transfer_engine, update_weights)
# are now native to vLLM workers.
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
model=MODEL_NAME,
enforce_eager=True,
tensor_parallel_size=2,
distributed_executor_backend="ray",
load_format="dummy",
weight_transfer_config=WeightTransferConfig(backend="nccl"),
)
# Generate text from the prompts.
prompts = [
"My name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Tokenize prompts to token IDs
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
prompt_token_ids_list = [
tokenizer.encode(prompt, add_special_tokens=False) for prompt in prompts
]
sampling_params = [
SamplingParams(temperature=0, max_tokens=2),
SamplingParams(temperature=0, max_tokens=32),
SamplingParams(temperature=0, max_tokens=32),
SamplingParams(temperature=0, max_tokens=32),
]
# Set up the communication channel between the training process and the
# inference engine.
master_address, master_port = ray.get(train_model.get_master_address_and_port.remote())
world_size = 3 # 1 trainer + 2 inference workers (tensor_parallel_size=2)
inference_handle = llm.init_weight_transfer_engine.remote(
WeightTransferInitRequest(
init_info=asdict(
NCCLWeightTransferInitInfo(
master_address=master_address,
master_port=master_port,
rank_offset=1,
world_size=world_size,
)
)
)
)
# Initialize weight transfer group on both the training actor and inference engine
train_handle = train_model.init_weight_transfer_group.remote(world_size)
ray.get([train_handle, inference_handle])
generation_futures = [
llm.generate_with_retry.remote(prompt_token_ids, params)
for prompt_token_ids, params in zip(prompt_token_ids_list, sampling_params)
]
finished, pending = ray.wait(generation_futures, num_returns=1)
# Pause generation in preparation for weight sync
ray.get(llm.pause_generation.remote(wait_for_inflight_requests=False))
# Synchronize the updated weights to the inference engine using batched API.
# Collect all weight metadata from the training actor
names, dtype_names, shapes = ray.get(train_model.get_weight_metadata.remote())
# Issue update_weights call with NCCL-specific update info
# packed=True enables efficient batched tensor broadcasting
inference_handle = llm.update_weights.remote(
WeightTransferUpdateRequest(
update_info=asdict(
NCCLWeightTransferUpdateInfo(
names=names,
dtype_names=dtype_names,
shapes=shapes,
packed=True,
)
)
)
)
# Broadcast all weights from trainer using the weight transfer API
train_handle = train_model.broadcast_weights.remote(packed=True)
ray.get([train_handle, inference_handle])
# Resume generation since weight sync is complete
ray.get(llm.resume_generation.remote())
# Get outputs separately - finished completed before pause, pending were paused/resumed
finished_outputs = ray.get(finished)
pending_outputs = ray.get(pending)
# Requests that finished before the pause: all generation used original weights
print("-" * 50)
print("Requests that completed BEFORE weight change:")
print("-" * 50)
for output in finished_outputs:
prompt_text = tokenizer.decode(output.prompt_token_ids)
print(f"Prompt: {prompt_text!r}")
print(f"Generated (with original weights): {output.outputs[0].text!r}")
print("-" * 50)
# Requests that were paused mid-generation: some text before, some after weight change
print("Requests that were PAUSED and RESUMED after weight change:")
print("-" * 50)
for output in pending_outputs:
# Decode the full prompt token IDs (original + generated before pause)
full_prompt_text = tokenizer.decode(output.prompt_token_ids)
# Find the original prompt by checking which one this output started with
original_prompt = next(p for p in prompts if full_prompt_text.startswith(p))
# output.prompt_token_ids contains original prompt + tokens generated before pause
# output.outputs[0].text is what was generated after resuming with new weights
text_before_pause = full_prompt_text[len(original_prompt) :]
text_after_pause = output.outputs[0].text
print(f"Original prompt: {original_prompt!r}")
print(f"Generated before weight change: {text_before_pause!r}")
print(f"Generated after weight change: {text_after_pause!r}")
print("-" * 50)

View File

@@ -20,7 +20,7 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e
* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
* Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
* Install the token on your machine (Run `huggingface-cli login`).
* Install the token on your machine (Run `hf auth login`).
* Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
## Example 1: Running with a local file

View File

@@ -0,0 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test for pause/resume with keep mode.
This test uses concurrent tasks to verify the engine truly stops generating
during pause:
1. Generator task: continuously generates and logs time between tokens
2. Controller task: sends pause/resume commands
If the engine properly pauses, we should see a gap in token timestamps
matching the pause duration.
"""
import asyncio
import time
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
PAUSE_DURATION = 3.0 # seconds
async def main():
# Create engine with a small model
engine_args = AsyncEngineArgs(
model="facebook/opt-125m",
enforce_eager=True,
)
engine = AsyncLLM.from_engine_args(engine_args)
prompt = "Write a story about a dragon. Once upon a time"
sampling_params = SamplingParams(max_tokens=30, ignore_eos=True)
# Track token arrival times
token_times: list[tuple[int, float]] = [] # (token_count, timestamp)
pause_time: float = 0
resume_time: float = 0
pause_token_idx: int = 0 # Index in token_times when pause occurred
async def generator_task():
"""Generate tokens and record timestamps."""
async for output in engine.generate(
request_id="test-req",
prompt=prompt,
sampling_params=sampling_params,
):
token_count = len(output.outputs[0].token_ids)
token_times.append((token_count, time.monotonic()))
print(
f"Token {token_count} arrived:"
f"T={token_times[-1][1] - token_times[0][1]:.3f}s"
)
return output
async def controller_task():
"""Pause and resume the engine after some tokens generated."""
nonlocal pause_time, resume_time, pause_token_idx
# Wait for some tokens to be generated
while len(token_times) < 5:
await asyncio.sleep(0.01)
print(f"\nPausing engine (keep mode) at token {len(token_times)}")
pause_time = time.monotonic()
await engine.pause_generation(mode="keep")
pause_token_idx = len(token_times)
print(f"Paused! Sleeping for {PAUSE_DURATION}s...")
# Sleep while paused - no tokens should be generated during this time
await asyncio.sleep(PAUSE_DURATION)
print("Resuming engine...")
await engine.resume_generation()
resume_time = time.monotonic()
print("Resumed!\n")
# Run both tasks concurrently
gen_task = asyncio.create_task(generator_task())
ctrl_task = asyncio.create_task(controller_task())
final_output, _ = await asyncio.gather(gen_task, ctrl_task)
# Verify the pause actually stopped generation.
# The gap after the pause token should be approximately the sleep duration.
pause_gap = token_times[pause_token_idx][1] - token_times[pause_token_idx - 1][1]
print(
f"\nGap after pause (token {pause_token_idx - 1} -> {pause_token_idx}): "
f"{pause_gap:.3f}s"
)
if pause_gap >= PAUSE_DURATION * 0.9:
print(f"✓ Test passed! Engine paused for ~{pause_gap:.1f}s")
else:
print(
f"✗ Test failed! Expected ~{PAUSE_DURATION}s gap after pause, "
f"got {pause_gap:.3f}s"
)
raise AssertionError("Engine did not properly pause")
# Verify request completed
assert final_output.finished, "Request should have finished"
assert len(final_output.outputs[0].token_ids) == 30, "Should have all tokens"
engine.shutdown()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -75,6 +75,7 @@ def parse_args():
parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)
parser.add_argument("--disable-padded-drafter-batch", action="store_true")
parser.add_argument("--max-num-seqs", type=int, default=None)
parser.add_argument("--parallel-drafting", action="store_true")
parser.add_argument("--allowed-local-media-path", type=str, default="")
return parser.parse_args()
@@ -121,6 +122,7 @@ def main(args):
"model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens,
"disable_padded_drafter_batch": args.disable_padded_drafter_batch,
"parallel_drafting": args.parallel_drafting,
}
elif args.method == "ngram":
speculative_config = {
@@ -137,6 +139,7 @@ def main(args):
"num_speculative_tokens": args.num_spec_tokens,
"enforce_eager": args.enforce_eager,
"max_model_len": args.max_model_len,
"parallel_drafting": args.parallel_drafting,
}
elif args.method == "mtp":
speculative_config = {

View File

@@ -0,0 +1,241 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates reinforcement learning from human feedback (RLHF) using vLLM
via HTTP API, with native weight syncing APIs.
Unlike rlhf.py which creates a vLLM instance programmatically, this script
assumes you have already started a vLLM server using `vllm serve`. It uses:
- OpenAI-compatible API for inference requests
- HTTP endpoints for weight transfer control plane
- NCCL for actual weight data transfer
Prerequisites:
Start a vLLM server with weight transfer enabled:
$ VLLM_SERVER_DEV_MODE=1 vllm serve facebook/opt-125m \
--enforce-eager \
--weight-transfer-config '{"backend": "nccl"}' \
--load-format dummy
Then run this script:
$ python rlhf_http.py
The example performs the following steps:
* Load the training model on GPU 0.
* Generate text using the vLLM server via OpenAI-compatible API. The output
is expected to be nonsense because the server is initialized with dummy weights.
* Initialize weight transfer via HTTP endpoint.
* Broadcast the real weights from the training model to the vLLM server
using NCCL.
* Generate text again to show normal output after the weight update.
"""
import requests
import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM
from vllm.distributed.weight_transfer.nccl_engine import (
NCCLWeightTransferEngine,
)
from vllm.utils.network_utils import get_ip, get_open_port
BASE_URL = "http://localhost:8000"
MODEL_NAME = "facebook/opt-125m"
def generate_completions(client: OpenAI, model: str, prompts: list[str]) -> list[str]:
"""Generate completions using the OpenAI-compatible API."""
results = []
for prompt in prompts:
response = client.completions.create(
model=model,
prompt=prompt,
max_tokens=32,
temperature=0,
)
results.append(response.choices[0].text)
return results
def init_weight_transfer_engine(
base_url: str,
master_address: str,
master_port: int,
rank_offset: int,
world_size: int,
) -> None:
"""Initialize weight transfer via HTTP endpoint."""
url = f"{base_url}/init_weight_transfer_engine"
payload = {
"init_info": dict(
master_address=master_address,
master_port=master_port,
rank_offset=rank_offset,
world_size=world_size,
)
}
response = requests.post(url, json=payload, timeout=60)
response.raise_for_status()
def update_weights(
base_url: str,
names: list[str],
dtype_names: list[str],
shapes: list[list[int]],
packed: bool = False,
) -> None:
"""Update weights via HTTP endpoint."""
url = f"{base_url}/update_weights"
payload = {
"update_info": dict(
names=names,
dtype_names=dtype_names,
shapes=shapes,
packed=packed,
)
}
response = requests.post(url, json=payload, timeout=300)
response.raise_for_status()
def pause_generation(base_url: str) -> None:
"""Pause generation via HTTP endpoint."""
url = f"{base_url}/pause"
response = requests.post(url, timeout=60)
response.raise_for_status()
def resume_generation(base_url: str) -> None:
"""Resume generation via HTTP endpoint."""
url = f"{base_url}/resume"
response = requests.post(url, timeout=60)
response.raise_for_status()
def get_world_size(base_url: str) -> int:
"""Get world size from the vLLM server."""
url = f"{base_url}/get_world_size"
response = requests.get(url, timeout=10)
response.raise_for_status()
return response.json()["world_size"]
def main():
# Get the inference world size from the vLLM server
inference_world_size = get_world_size(BASE_URL)
world_size = inference_world_size + 1 # +1 for the trainer
device = f"cuda:{inference_world_size}"
torch.cuda.set_device(device)
# Load the training model
print(f"Loading training model: {MODEL_NAME}")
train_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.bfloat16)
train_model.to(device)
# Create OpenAI client pointing to the vLLM server
client = OpenAI(
base_url=f"{BASE_URL}/v1",
api_key="EMPTY", # vLLM doesn't require an API key by default
)
# Test prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Generate text before weight update. The output is expected to be nonsense
# because the server is initialized with dummy weights.
print("-" * 50)
print("Generating text BEFORE weight update (expect nonsense):")
print("-" * 50)
outputs = generate_completions(client, MODEL_NAME, prompts)
for prompt, generated_text in zip(prompts, outputs):
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Set up the communication channel between the training process and the
# vLLM server. The trainer is rank 0, vLLM worker(s) start at rank_offset.
master_address = get_ip()
master_port = get_open_port()
rank_offset = 1
print(f"Initializing weight transfer: master={master_address}:{master_port}")
# Initialize weight transfer on vLLM server (this is async, server will
# wait for NCCL connection)
import threading
init_thread = threading.Thread(
target=init_weight_transfer_engine,
args=(BASE_URL, master_address, master_port, rank_offset, world_size),
)
init_thread.start()
# Initialize NCCL process group on trainer side
model_update_group = NCCLWeightTransferEngine.trainer_init(
dict(
master_address=master_address,
master_port=master_port,
world_size=world_size,
),
)
# Wait for init_weight_transfer_engine to complete
init_thread.join()
# Pause generation before weight sync
pause_generation(BASE_URL)
# Collect weight metadata for the update request
names = []
dtype_names = []
shapes = []
for name, p in train_model.named_parameters():
names.append(name)
dtype_names.append(str(p.dtype).split(".")[-1])
shapes.append(list(p.shape))
# Start the update_weights call in a separate thread since it will block
# waiting for NCCL broadcasts
# packed=True enables efficient batched tensor broadcasting
update_thread = threading.Thread(
target=update_weights,
args=(BASE_URL, names, dtype_names, shapes, True), # packed=True
)
update_thread.start()
# Broadcast all weights from trainer to vLLM workers
print("Broadcasting weights via NCCL...")
NCCLWeightTransferEngine.trainer_send_weights(
iterator=train_model.named_parameters(),
group=model_update_group,
packed=True,
)
# Wait for update_weights to complete
update_thread.join()
# Resume generation after weight sync
resume_generation(BASE_URL)
# Generate text after weight update. The output is expected to be normal
# because the real weights are now loaded.
print("-" * 50)
print("Generating text AFTER weight update:")
print("-" * 50)
outputs_updated = generate_completions(client, MODEL_NAME, prompts)
for prompt, generated_text in zip(prompts, outputs_updated):
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
main()
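For context, the example above relies on a `generate_completions` helper that is not shown in this excerpt. A minimal sketch of an equivalent helper against the vLLM OpenAI-compatible server follows; the function name and defaults here are illustrative assumptions, not the file's actual implementation.

# Illustrative sketch only (assumed helper, not the example file's real implementation).
from openai import OpenAI


def generate_completions_sketch(
    client: OpenAI, model: str, prompts: list[str], max_tokens: int = 32
) -> list[str]:
    """Issue one /v1/completions request per prompt and return the generated text."""
    results: list[str] = []
    for prompt in prompts:
        completion = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=0.0,  # deterministic output for easier before/after comparison
        )
        results.append(completion.choices[0].text)
    return results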

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""Example Python client for multimodal classification API using vLLM API server
NOTE:
start a supported multimodal classification model server with `vllm serve`, e.g.
vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \
--runner pooling \
--max-model-len 5000 \
--limit-mm-per-prompt '{"video": 1}' \
--hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}'
"""
import argparse
import pprint
import requests
from vllm.multimodal.utils import encode_image_url, fetch_image
input_text = "This product was excellent and exceeded my expectations"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
def parse_args():
parse = argparse.ArgumentParser()
parse.add_argument("--host", type=str, default="localhost")
parse.add_argument("--port", type=int, default=8000)
return parse.parse_args()
def main(args):
base_url = f"http://{args.host}:{args.port}"
models_url = base_url + "/v1/models"
classify_url = base_url + "/classify"
response = requests.get(models_url)
model_name = response.json()["data"][0]["id"]
print("Text classification output:")
messages = [
{
"role": "assistant",
"content": "Please classify this text request.",
},
{
"role": "user",
"content": input_text,
},
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image base64 classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": image_base64},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Video url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this video."},
{"type": "video_url", "video_url": {"url": video_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
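For comparison, the snippet below sketches an offline-inference counterpart of the text-classification case above. It is illustrative only: it assumes the `vllm.LLM.classify` pooling API and a `probs` field on its outputs, and it mirrors the `vllm serve` flags from the docstring; it is not part of the example file.

from vllm import LLM

# Hypothetical offline sketch: classify the same text without going through the HTTP server.
# Engine arguments mirror the `vllm serve` command shown in the docstring above.
llm = LLM(
    model="muziyongshixin/Qwen2.5-VL-7B-for-VideoCls",
    runner="pooling",
    max_model_len=5000,
    limit_mm_per_prompt={"video": 1},
    hf_overrides={
        "text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}
    },
)
(output,) = llm.classify(["This product was excellent and exceeded my expectations"])
print(output.outputs.probs)  # class probabilities (assumed field name)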

View File

@@ -11,23 +11,79 @@ on HuggingFace model repository.
import argparse
from dataclasses import asdict
from pathlib import Path
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings
ROOT_DIR = Path(__file__).parent.parent.parent
EMBED_TEMPLATE_DIR = ROOT_DIR / "pooling/embed/template/"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
multi_modal_data = {"image": fetch_image(image_url)}
def print_embeddings(embeds: list[float]):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(seed: int):
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl():
def run_e5_v(seed: int):
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:")
prompt_text = llama3_template.format(
f"{text}\nSummary above sentence in one word: "
)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = llama3_template.format("<image>\nSummary above image in one word: ")
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl(seed: int):
try:
from qwen_vl_utils import smart_resize
except ModuleNotFoundError:
@@ -61,20 +117,20 @@ def run_qwen3_vl():
)
default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args))
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text_prompt, use_tqdm=False)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
outputs = llm.embed(
{
"prompt": image_prompt,
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -84,7 +140,162 @@ def run_qwen3_vl():
print("Image+Text embedding output:")
outputs = llm.embed(
{
"prompt": image_text_prompt,
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_siglip(seed: int):
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_qwen2vl(seed: int):
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EMBED_TEMPLATE_DIR / "vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -93,7 +304,12 @@ def run_qwen3_vl():
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
@@ -103,16 +319,23 @@ def parse_args():
)
parser.add_argument(
"--model",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
required=True,
help="The name of the embedding model.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args):
model_example_map[args.model]()
model_example_map[args.model](args.seed)
if __name__ == "__main__":

View File

@@ -17,6 +17,8 @@ from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
from PIL import Image
from vllm.utils.print_utils import print_embeddings
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
@@ -51,11 +53,6 @@ def create_chat_embeddings(
)
def print_embeddings(embeds):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(client: OpenAI, model: str):
"""
Start the server using:
@@ -105,7 +102,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
"""
response = create_chat_embeddings(
client,
@@ -316,7 +313,7 @@ def run_vlm2vec(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
"""
response = create_chat_embeddings(

View File

@@ -1,441 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal pooling.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from dataclasses import asdict
from pathlib import Path
from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"
class TextQuery(TypedDict):
modality: Literal["text"]
text: str
class ImageQuery(TypedDict):
modality: Literal["image"]
image: Image
class TextImageQuery(TypedDict):
modality: Literal["text+image"]
text: str
image: Image
class TextImagesQuery(TypedDict):
modality: Literal["text+images"]
text: str
image: ScoreMultiModalParam
QueryModality = Literal["text", "image", "text+image", "text+images"]
Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str | None = None
image: Image | None = None
query: str | None = None
documents: ScoreMultiModalParam | None = None
def run_clip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
query=query["text"],
documents=query["image"],
)
def run_qwen3_vl(query: Query) -> ModelRequestData:
image_placeholder = "<vision_start><|image_pad|><vision_end>"
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = image_placeholder
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_placeholder}\n{text}"
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: {modality!r}")
return prompt, image
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
if modality == "image":
return ImageQuery(
modality="image",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501
),
)
if modality == "text+image":
return TextImageQuery(
modality="text+image",
text="A cat standing in the snow.",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501
),
)
if modality == "text+images":
return TextImagesQuery(
modality="text+images",
text="slm markdown",
image={
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
},
)
msg = f"Modality {modality} is not supported."
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
mm_data = {}
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = llm.embed(
{
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
}
)
print("-" * 50)
for output in outputs:
print(output.outputs.embedding)
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
outputs = llm.score(req_data.query, req_data.documents)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"vision language models for multimodal pooling tasks."
)
parser.add_argument(
"--model-name",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
help="The name of the embedding model.",
)
parser.add_argument(
"--task",
"-t",
type=str,
default="embedding",
choices=["embedding", "scoring"],
help="The task type.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=get_args(QueryModality),
help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args: Namespace):
if args.task == "embedding":
run_encode(args.model_name, args.modality, args.seed)
elif args.task == "scoring":
run_score(args.model_name, args.modality, args.seed)
else:
raise ValueError(f"Unsupported task: {args.task}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -30,6 +30,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -43,6 +44,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -89,6 +94,15 @@ def main(args):
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"query": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -15,20 +15,47 @@ from pathlib import Path
from typing import NamedTuple
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
TEMPLATE_HOME = Path(__file__).parent / "template"
query = "A woman playing with her dog on a beach at sunset."
document = (
"A woman shares a joyful moment with her golden retriever on a sun-drenched "
"beach at sunset, as the dog offers its paw in a heartwarming display of "
"companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
"text": document,
},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
class RerankModelData(NamedTuple):
engine_args: EngineArgs
chat_template: str | None = None
modality: set[str] = set()
def run_jinavl_reranker(modality: str) -> RerankModelData:
assert modality == "image"
def run_jinavl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
@@ -38,19 +65,15 @@ def run_jinavl_reranker(modality: str) -> RerankModelData:
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={modality: 1},
)
return RerankModelData(
engine_args=engine_args,
)
return RerankModelData(engine_args=engine_args, modality={"image"})
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
def run_qwen3_vl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Reranker-2B",
runner="pooling",
max_model_len=16384,
limit_mm_per_prompt={modality: 1},
# HuggingFace model configuration overrides required for compatibility
hf_overrides={
# Manually route to sequence classification architecture
@@ -71,10 +94,11 @@ def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
return RerankModelData(
engine_args=engine_args,
chat_template=chat_template,
modality={"image", "video"},
)
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
model_example_map: dict[str, Callable[[], RerankModelData]] = {
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl_reranker": run_qwen3_vl_reranker,
}
@@ -93,78 +117,67 @@ def parse_args():
choices=model_example_map.keys(),
help="The name of the reranker model.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=["image", "video"],
help="Modality of the multimodal input (image or video).",
)
return parser.parse_args()
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
# Sample query for testing the reranker
if modality == "image":
query = "A woman playing with her dog on a beach at sunset."
# Sample multimodal documents to be scored against the query
# Each document contains an image URL that will be fetched and processed
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
),
},
{
"type": "image_url",
"image_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
},
},
]
}
elif modality == "video":
query = "A girl is drawing pictures on an ipad."
# Sample video documents to be scored against the query
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
},
{
"type": "video_url",
"video_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
},
},
]
}
else:
raise ValueError(f"Unsupported modality: {modality}")
return query, documents
def main(args: Namespace):
# Run the selected reranker model
modality = args.modality
model_request = model_example_map[args.model_name](modality)
model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args))
query, documents = get_multi_modal_input(modality)
outputs = llm.score(query, documents, chat_template=model_request.chat_template)
print("-" * 50)
print(f"Model: {engine_args.model}")
print(f"Modality: {modality}")
print(f"Query: {query}")
print("Query: string & Document: string")
outputs = llm.score(query, document)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text")
outputs = llm.score(
query, {"content": [documents[0]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image url")
outputs = llm.score(
query, {"content": [documents[1]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image base64")
outputs = llm.score(
query, {"content": [documents[2]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
if "video" in model_request.modality:
print("Query: string & Document: video url")
outputs = llm.score(
query,
{"content": [documents[3]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text + image url")
outputs = llm.score(
query,
{"content": [documents[0], documents[1]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: list")
outputs = llm.score(
query,
[
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("-" * 50)
if __name__ == "__main__":

View File

@@ -29,6 +29,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -42,6 +43,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -92,6 +97,15 @@ def main(args):
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -141,6 +141,7 @@ extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/reo.js
- mkdocs/javascript/run_llm_widget.js
- mkdocs/javascript/mathjax.js
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js

View File

@@ -9,7 +9,7 @@ requires = [
"torch == 2.9.1",
"wheel",
"jinja2",
"grpcio-tools",
"grpcio-tools==1.78.0",
]
build-backend = "setuptools.build_meta"

View File

@@ -9,5 +9,5 @@ wheel
jinja2>=3.1.6
regex
build
protobuf
grpcio-tools
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.*
grpcio-tools==1.78.0 # Required for grpc entrypoints

View File

@@ -9,7 +9,7 @@ blake3
py-cpuinfo
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer, gRPC.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp >= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
@@ -52,4 +52,4 @@ anthropic >= 0.71.0
model-hosting-container-standards >= 0.1.13, < 1.0.0
mcp
grpcio
grpcio-reflection
grpcio-reflection

View File

@@ -10,4 +10,4 @@ torchaudio==2.9.1
# These must be updated alongside torch
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.2
flashinfer-python==0.6.3

View File

@@ -43,5 +43,5 @@ tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
fastsafetensors>=0.2.2
pydantic>=2.12 # 2.11 leads to error on python 3.13

View File

@@ -58,7 +58,7 @@ schemathesis==3.39.15
# OpenAI schema test
# Evaluation and benchmarking
lm-eval[api]>=0.4.9.2
lm-eval[api]==0.4.9.2
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
@@ -95,4 +95,4 @@ albumentations==1.4.6
# Pin transformers version
transformers==4.57.3
# Pin HF Hub version
huggingface-hub==0.36.1
huggingface-hub==0.36.2

View File

@@ -1,6 +1,11 @@
# Common dependencies
-r common.txt
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
grpcio-tools==1.78.0
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for AMD GPUs
@@ -14,5 +19,4 @@ setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
conch-triton-kernels==1.2.1
timm>=1.0.17
grpcio-tools>=1.76.0
timm>=1.0.17

View File

@@ -48,11 +48,16 @@ buildkite-test-collector==0.1.9
genai_perf>=0.0.8
tritonclient>=2.51.0
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
grpcio-tools==1.78.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test

View File

@@ -220,7 +220,7 @@ fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
# via cupy-cuda12x
fastsafetensors==0.1.10
fastsafetensors==0.2.2
# via -r requirements/test.in
filelock==3.16.1
# via
@@ -303,8 +303,17 @@ graphql-relay==3.2.0
# via graphene
greenlet==3.2.3
# via sqlalchemy
grpcio==1.76.0
# via ray
grpcio==1.78.0
# via
# -r requirements/test.in
# grpcio-reflection
# grpcio-tools
# ray
# tensorboard
grpcio-reflection==1.78.0
# via -r requirements/test.in
grpcio-tools==1.78.0
# via -r requirements/test.in
gunicorn==23.0.0
# via mlflow
h11==0.14.0
@@ -332,7 +341,7 @@ httpx==0.27.2
# -r requirements/test.in
# perceptron
# schemathesis
huggingface-hub==0.36.1
huggingface-hub==0.36.2
# via
# accelerate
# datasets
@@ -777,6 +786,8 @@ protobuf==6.33.2
# via
# google-api-core
# googleapis-common-protos
# grpcio-reflection
# grpcio-tools
# mlflow-skinny
# opentelemetry-proto
# proto-plus
@@ -1046,6 +1057,7 @@ sentence-transformers==5.2.0
# mteb
setuptools==77.0.3
# via
# grpcio-tools
# lightning-utilities
# pytablewriter
# torch
@@ -1164,7 +1176,6 @@ torch==2.9.1+cu129
# bitsandbytes
# efficientnet-pytorch
# encodec
# fastsafetensors
# kornia
# lightly
# lightning

View File

@@ -15,4 +15,4 @@ torch==2.10.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl

View File

@@ -1035,7 +1035,7 @@ setup(
extras_require={
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
@@ -1049,6 +1049,13 @@ setup(
"petit-kernel": ["petit-kernel"],
# Optional deps for Helion kernel development
"helion": ["helion"],
# Optional deps for OpenTelemetry tracing
"otel": [
"opentelemetry-sdk>=1.26.0",
"opentelemetry-api>=1.26.0",
"opentelemetry-exporter-otlp>=1.26.0",
"opentelemetry-semantic-conventions-ai>=0.4.1",
],
},
cmdclass=cmdclass,
package_data=package_data,

View File

@@ -11,10 +11,10 @@ from torch import fx
from torch._ops import OpOverload
from torch.fx._utils import lazy_format_graph_code
from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.inductor_pass import InductorPass
from vllm.compilation.pass_manager import with_pattern_match_debug
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.compilation.passes.fx_utils import find_op_nodes
from vllm.compilation.passes.inductor_pass import InductorPass
from vllm.compilation.passes.pass_manager import with_pattern_match_debug
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.logger import init_logger

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from tests.utils import (
compare_two_settings,
create_new_process_for_each_test,
)
from vllm.config import (
CompilationMode,
)
@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model_id",
["meta-llama/Llama-3.2-1B-Instruct", "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"],
)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"])
@pytest.mark.parametrize("eager_mode", [False, True])
def test_async_tp_pass_correctness(
model_id: str,
tp_size: int,
async_tp_enabled: bool,
distributed_backend: str,
eager_mode: bool,
num_gpus_available: int,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
model_info.check_available_online(on_fail="skip")
pp_size = 1
if num_gpus_available < tp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
common_args = [
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if eager_mode:
common_args.append("--enforce-eager")
compilation_config = {
"mode": CompilationMode.VLLM_COMPILE,
"compile_sizes": [2, 4, 8],
"splitting_ops": [],
"pass_config": {"fuse_gemm_comms": async_tp_enabled},
}
async_tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
"--compilation_config",
json.dumps(compilation_config),
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
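The pass configuration exercised by this test can also be enabled programmatically. A hedged sketch follows; it assumes `LLM` accepts the same `compilation_config` dictionary shape that the test passes on the command line, and the model and compile sizes are illustrative only.

from vllm import LLM
from vllm.config import CompilationMode

# Illustrative sketch: enable the async-TP (GEMM + collective fusion) pass offline.
# Requires at least 2 GPUs for tensor_parallel_size=2.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    tensor_parallel_size=2,
    compilation_config={
        "mode": CompilationMode.VLLM_COMPILE,
        "compile_sizes": [2, 4, 8],
        "splitting_ops": [],
        "pass_config": {"fuse_gemm_comms": True},
    },
)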

View File

@@ -21,8 +21,8 @@ from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_sequence_parallel")

View File

@@ -82,19 +82,17 @@ INDUCTOR_GRAPH_PARTITION = [
]
FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
"rms_quant_fusion": re.compile(
r"\[(?:compilation/)?fusion.py:\d+] Replaced (\d+) patterns"
),
"act_quant_fusion": re.compile(
r"activation_quant_fusion.py:\d+] Replaced (\d+) patterns"
),
"rms_quant_fusion": re.compile(r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"),
"act_quant_fusion": re.compile(r"act_quant_fusion.py:\d+] Replaced (\d+) patterns"),
"norm_rope_fusion": re.compile(
r"qk_norm_rope_fusion.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
),
"attn_quant_fusion": re.compile(
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes"
r"attn_quant_fusion.py:\d+] Fused quant onto (\d+) attention nodes"
),
"ar_rms_fusion": re.compile(
r"allreduce_rms_fusion.py:\d+] Replaced (\d+) patterns"
),
"ar_rms_fusion": re.compile(r"collective_fusion.py:\d+] Replaced (\d+) patterns"),
"sequence_parallel": re.compile(
r"sequence_parallelism.py:\d+] Replaced (\d+) patterns"
),

View File

@@ -101,9 +101,7 @@ qwen3_a3b_fp8 = ModelFusionInfo(
model_name="Qwen/Qwen3-30B-A3B-FP8",
matches=lambda n_layers: Matches(
rms_quant_fusion=n_layers,
# TODO broken on Blackwell:
# https://github.com/vllm-project/vllm/issues/33295
norm_rope_fusion=0 if is_blackwell() else n_layers,
norm_rope_fusion=n_layers,
attn_quant_fusion=0, # attn + group quant not supported
ar_rms_fusion=n_layers * 2 + 1,
sequence_parallel=n_layers * 2 + 1,

View File


@@ -1,16 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import torch
import vllm.envs as envs
from vllm.compilation.collective_fusion import AsyncTPPass
from tests.compile.backend import TestBackend
from tests.utils import (
multi_gpu_test,
)
from vllm.compilation.passes.fusion.collective_fusion import AsyncTPPass
from vllm.config import (
CompilationConfig,
CompilationMode,
DeviceConfig,
ModelConfig,
PassConfig,
@@ -29,14 +31,6 @@ from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import (
compare_two_settings,
create_new_process_for_each_test,
multi_gpu_test,
)
from ..backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
prompts = [
@@ -377,67 +371,3 @@ def async_tp_pass_on_test_model(
# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
backend.check_after_ops(model.ops_in_model_after())
@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model_id",
["meta-llama/Llama-3.2-1B-Instruct", "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"],
)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"])
@pytest.mark.parametrize("eager_mode", [False, True])
def test_async_tp_pass_correctness(
model_id: str,
tp_size: int,
async_tp_enabled: bool,
distributed_backend: str,
eager_mode: bool,
num_gpus_available: int,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
model_info.check_available_online(on_fail="skip")
pp_size = 1
if num_gpus_available < tp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
common_args = [
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if eager_mode:
common_args.append("--enforce-eager")
compilation_config = {
"mode": CompilationMode.VLLM_COMPILE,
"compile_sizes": [2, 4, 8],
"splitting_ops": [],
"pass_config": {"fuse_gemm_comms": async_tp_enabled},
}
async_tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
"--compilation_config",
json.dumps(compilation_config),
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")

View File

@@ -6,11 +6,15 @@ import pytest
import torch
import vllm.envs as envs
from tests.compile.backend import TestBackend
from tests.utils import TestFP8Layer, has_module_attribute, multi_gpu_test
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.compilation.collective_fusion import AllReduceFusionPass
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.compilation.passes.fusion.allreduce_rms_fusion import AllReduceFusionPass
from vllm.compilation.passes.utility.fix_functionalization import (
FixFunctionalizationPass,
)
from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
from vllm.config import (
CompilationConfig,
CompilationMode,
@@ -33,9 +37,6 @@ from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import TestFP8Layer, has_module_attribute, multi_gpu_test
from ..backend import TestBackend
class TestAllReduceRMSNormModel(torch.nn.Module):
def __init__(self, hidden_size=16, token_num=16, eps=1e-6):

View File

@@ -5,12 +5,14 @@ import pytest
import torch
import vllm.envs as envs
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from tests.compile.backend import TestBackend
from tests.utils import TestFP8Layer, multi_gpu_test
from vllm.compilation.passes.fusion.rms_quant_fusion import RMSNormQuantFusionPass
from vllm.compilation.passes.fusion.sequence_parallelism import SequenceParallelismPass
from vllm.compilation.passes.fx_utils import find_auto_fn
from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
from vllm.config import (
CompilationConfig,
CUDAGraphMode,
@@ -34,9 +36,6 @@ from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed
from ...utils import TestFP8Layer, multi_gpu_test
from ..backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
prompts = [
"Hello, my name is",

View File

@@ -5,12 +5,18 @@ import pytest
import torch
import vllm.envs as envs
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.compilation.fusion import RMSNormQuantFusionPass
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
from tests.compile.backend import TestBackend
from tests.utils import TestFP8Layer
from vllm.compilation.passes.fusion.act_quant_fusion import (
ActivationQuantFusionPass,
)
from vllm.compilation.passes.fusion.rms_quant_fusion import RMSNormQuantFusionPass
from vllm.compilation.passes.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
from vllm.compilation.passes.utility.fix_functionalization import (
FixFunctionalizationPass,
)
from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
from vllm.config import (
CompilationConfig,
ModelConfig,
@@ -26,9 +32,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from ..utils import TestFP8Layer
from .backend import TestBackend
TEST_FP8 = current_platform.supports_fp8()
FP8_DTYPE = current_platform.fp8_dtype()

Some files were not shown because too many files have changed in this diff.