Compare commits

...

225 Commits

Author SHA1 Message Date
Harry Mellor
f83b933b84 [CI] Bump mypy version to 1.19.1 (#36104)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-10 09:18:28 -07:00
Pleaplusone
82f3f30e26 [ROCm][Perf] Enable sparse_mla's cudagraph on ROCm platform (#35719)
Signed-off-by: ganyi <ygan@amd.com>
2026-03-10 09:14:35 -07:00
Matthew Bonanni
9095cbbfb6 [Bugfix][Sparse MLA] report indexer CG support properly (#36519)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-10 09:14:31 -07:00
Hashem Hashemi
721ae79f50 Improvements to wvSplitKrc skinny GEMM solution (#34304)
Signed-off-by: Hashem Hashemi <hashem.hashemi@amd.com>
2026-03-10 09:14:27 -07:00
AllenDou
aefc59f088 FunASR model bugfix (#36633)
Signed-off-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
2026-03-10 08:14:21 -07:00
Harry Mellor
d88f28da05 Fix hf_override_fn when it modifies model_type (#35200)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-10 15:03:18 +00:00
Srinivasoo7
106ff69c4e feat(kv-offload): Strategy A — StoreReusedOffloadingManager gates CPU stores on reuse frequency (#35342)
Signed-off-by: srinivas_oo7 <Sriusa4414@gmail.com>
Signed-off-by: Sriusa4414@gmail.com
Signed-off-by: Srinivasoo7 <158864704+Srinivasoo7@users.noreply.github.com>
Co-authored-by: srinivas_oo7 <sklinkedin0120@gmail.com>
Co-authored-by: Srinivasoo7 <158864704+Srinivasoo7@users.noreply.github.com>
Co-authored-by: Or Ozeri <oro@il.ibm.com>
2026-03-10 14:43:40 +00:00
Jiangyun Zhu
ca5fb4bbd8 [Bugfix] Avoid merging empty-only partitions into splitting-op subgraphs (#36595)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2026-03-10 07:39:01 -07:00
Alvin Tang
cf88b23749 fix: check HTTP status in batch read_file to prevent silent failures (#36397)
Signed-off-by: gambletan <ethanchang32@gmail.com>
Co-authored-by: gambletan <ethanchang32@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:22:40 -07:00
wang.yuqi
a3189a08b0 [Model] Consolidate score logic by introduce score_type (#36479)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-03-10 13:32:25 +00:00
SoluMilken
409c4e632d [Misc] fix typo: homogenous-> homogeneous (2 lines change) (#36508)
Signed-off-by: SoluMilken <ypiheyn.imm02g@g2.nctu.edu.tw>
2026-03-10 06:25:37 -07:00
Raushan Turganbay
8850738b70 [Bugfix] Fix processor signature (#36630)
Signed-off-by: raushan <raushan@huggingface.co>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-10 06:20:47 -07:00
Mark McLoughlin
234860399b [Frontend][Core] Revert "Add shutdown timeout" (#34730 and #36270) (#36628)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-03-10 06:20:41 -07:00
Harry Mellor
c88510083b Fix Qwen2.5-VL test for Transformers v5 (#36532)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-10 12:05:34 +00:00
Vadim Gimpelson
4ff8c3c8f9 [BUGFIX][Mamba][Qwen3.5] Zero freed SSM cache blocks on GPU (#35219)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2026-03-10 03:32:20 -07:00
Chang Su
507ddbe992 feat(grpc): extract gRPC servicer into smg-grpc-servicer package, add --grpc flag to vllm serve (#36169)
Signed-off-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-03-10 03:29:59 -07:00
Nick Hill
ddbb0d230a [Model Runner V2] Fix mm input embeddings lookup (#36588)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-10 00:24:58 -07:00
Nick Hill
9efc3bdcd6 [Model Runner V2] Fix _compute_slot_mappings_kernel for chunked prefill (#36580)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-10 00:23:42 -07:00
amirkl94
156e33553c Fix: Re-Enable EP for trtllm MoE FP8 backend (#36494)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
2026-03-09 23:11:27 -07:00
hallerite
d0cd736caa [Bugfix] Fix RuntimeError: Already borrowed that degrades VLM serving throughput under concurrent load. (#36557)
Signed-off-by: hallerite <hallerite@users.noreply.github.com>
Signed-off-by: hallerite <git@hallerite.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-03-09 22:30:51 -07:00
Harry Mellor
195c997203 Fix LFM2 MoE test for Transformers v5 (#36534)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-09 22:29:17 -07:00
Zhuohan Li
04b67d8f62 Remove unused disable_fallback field (#36546) 2026-03-09 20:56:54 -07:00
Wentao Ye
7279374f91 [Perf] Compute maxsim in worker side, reducing redundant copies, 2.7% E2E throughput improvement (#36159)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-09 20:55:58 -07:00
Woosuk Kwon
006aea17d7 [BugFix] Remove incorrect assert in split_decodes_and_prefills (#36553)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-09 20:02:02 -07:00
Hojin Yang
0836be3b03 [Model] Add HyperCLOVAX-SEED-Think-32B vision-language model support (#31471)
Signed-off-by: effortprogrammer <yhjhoward7@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-03-10 10:59:19 +08:00
Ajay Anubolu
4e95ec111c [Bugfix] Fix Qwen3-Next in_proj_ba weight sharding with TP > 1 (#36242)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
2026-03-09 19:16:26 -07:00
Andreas Karatzas
179547d62c [ROCm][CI] Fix ROCm GPT-OSS Eval test group (#36179)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-09 17:55:20 -07:00
youkaichao
f85b4eda3a [bugfix] fix nvlink for nixl/ucx (#36475)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2026-03-10 07:49:47 +08:00
Woosuk Kwon
2a194ddd72 [Model Runner V2] Add model_state inputs to CUDA graph capture (#36544)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-09 15:14:51 -07:00
Shaun Kotek
203a7f27da add nemotron v3 reasoning parser (#36393)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Co-authored-by: root <root@gpu-259.slurm-workers-slurm.slurm.svc.cluster.local>
2026-03-09 15:11:41 -07:00
Lucas Wilkinson
483463f735 [MRV2] Extensible CG dispatch rework (#35959)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-03-09 13:58:45 -07:00
Matthew Bonanni
4e571ce643 [MTP][Misc] Clean up dead code (#36507)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-09 14:43:06 -04:00
Micah Williamson
4ff9b045fe [ROCm][CI] Prep Tests For Change To ROCM_ATTN As New Default Backend On ROCm (#36025)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-03-09 13:27:55 -05:00
Lucas Kabela
3fd03f1ec2 [BE] Rename should_torch_compile_mm_vit to should_torch_compile_mm_encoder (#36281)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2026-03-09 18:22:05 +00:00
Woosuk Kwon
10a5f4d53d [Model Runner V2] Use NamedTuple for execute_model_state (#35930)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-09 11:17:34 -07:00
Simon Mo
fe0c085c28 [Docs] Remove the reo beacon (#36528)
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2026-03-09 11:16:50 -07:00
Taneem Ibrahim
8d6b3d5dda [Misc] Refactored 5 duplicate helper functions that were copied-pasted across multiple parsers (#36436)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
2026-03-09 14:14:11 -04:00
Copilot
4b87ffbefb [torch.compile] Rename compile_ranges_split_points to compile_ranges_endpoints (#36027)
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-03-09 18:04:40 +00:00
Shaun Kotek
fa028207aa Fix/resupport nongated fused moe triton (#36412)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Signed-off-by: liweiguang <codingpunk@gmail.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Alex Brooks <albrooks@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
Signed-off-by: Tushar Shetty <tushar.shetty@abbyy.com>
Signed-off-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: nvnbagrov <nbagrov@nvidia.com>
Co-authored-by: Sage <80211083+sagearc@users.noreply.github.com>
Co-authored-by: danisereb <daserebrenik@nvidia.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Weiguang Li <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Alex Brooks <albrooks@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: cong-or <conchubhar.gannon@gmail.com>
Co-authored-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
Co-authored-by: liuzhenwei <zhenwei.liu@intel.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-09 11:01:18 -07:00
Russell Bryant
d460a18fc6 [Docs] Expand --allowed-media-domains security guidance with threat details (#36506)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-09 17:43:42 +00:00
Woosuk Kwon
6e956d9eca [Model Runner V2] Add dummy profile_cudagraph_memory API (#36520)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-03-09 10:20:13 -07:00
Andreas Karatzas
1e0f917b34 [ROCm][CI] Fix logprob divergence for TitanML/tiny-mixtral under AITER rms_norm (#36101)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-09 12:07:44 -05:00
Andreas Karatzas
c174d54f86 [ROCm][CI] Fix ROCm attention backend validation for head sizes, block sizes, and compute capability checks (#36292)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-09 12:02:41 -05:00
SoluMilken
55d27cca55 [Misc] fix typo: dependant -> dependent (2 lines change) (#36511)
Signed-off-by: SoluMilken <ypiheyn.imm02g@g2.nctu.edu.tw>
2026-03-09 10:00:12 -07:00
Roberto L. Castro
580864d81e [Attention][Perf][Kernel] Replace torch.cat with vectorized CUDA kernel MLA query concat - DeepSeek-V3.2 (#34917)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
2026-03-09 09:50:36 -07:00
Roberto L. Castro
2b28b9b269 [Attention][Perf] Optimize cp_gather_and_upconvert_fp8_kv_cache - DeepSeek-v3.2 (#35290)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
2026-03-09 09:46:57 -07:00
Taoyu Zhu
70485a11bd [ROCM] Optimize the fused_topk_bias to use aiter instead of fallback torch ops. (#36253)
Signed-off-by: zhutaoyu <zhutaoyu97@gmail.com>
2026-03-09 11:30:35 -05:00
Harry Mellor
74a9f54cdb [CI] Fix edge case that could lead to broken docs builds on main (#36515)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-09 09:06:19 -07:00
Matthew Bonanni
00c4cb5606 [Bugfix] Clear stale CG keys after memory profiling (#36416)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-09 11:56:00 -04:00
Wentao Ye
941e52c298 [Refactor] Simplify chat_completion_full_generator for tool parsers (#35634)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-09 23:33:46 +08:00
Wentao Ye
be292b7c14 [Bug] Fix pooling model benchmark script (#36300)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-09 11:17:45 -04:00
Matthew Bonanni
77a73458e3 Reapply [Attention] Refactor check_and_update_config (#35122)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-03-09 07:17:14 -07:00
Tianyu Guo
5578f2a4d3 Support online use_audio_in_video (#36319)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-09 07:16:44 -07:00
Cyrus Leung
3ec2115015 [Frontend] Move warmup into Renderer (#36482)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-09 06:03:21 -07:00
Isotr0py
b0906d8b02 [MM Encoder] Default to use TORCH_SDPA backend for ViT on Volta/Turing GPU (#36472)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-09 03:43:44 -07:00
Kevin H. Luu
aaf5fa9abf [ci] Bound openai dependency to 2.24.0 (#36471)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-03-09 03:43:26 -07:00
Cyrus Leung
f96c3ab08c [Deprecation][1/2] Remove items deprecated in v0.18 (#36470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-09 03:43:23 -07:00
Xin Yang
dc6b578466 [Kernel] Add fused_sigmoid_gating_delta_rule_update kernel for Qwen3 Next (#35777)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-03-08 23:41:01 -07:00
liuzhenwei
1bc9c77f6d [XPU] Add test script of PD disaggregation (#36434)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2026-03-09 05:50:27 +00:00
Alex Brooks
65a4da1504 [Frontend] Add Support for MM Encoder/Decoder Beam Search (Online Transcriptions) (#36160)
Signed-off-by: Alex Brooks <albrooks@redhat.com>
2026-03-09 05:46:23 +00:00
Li, Jiang
217f27598d [Bugfix] Avoid to replace non-tensor members in cpu model runner (#36430)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-03-09 13:06:28 +08:00
wang.yuqi
fff3711a24 [Frontend][2/n] Improve pooling entrypoints | embed. (#36110)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
2026-03-09 11:42:19 +08:00
Tushar Shetty
c4d859c274 [Bugfix] Skip out-of-stage layers in get_layers_from_vllm_config for pipeline parallel (#36243)
Signed-off-by: Tushar Shetty <tushar.shetty@abbyy.com>
Signed-off-by: Tushar Shetty <54362365+tusharshetty61@users.noreply.github.com>
2026-03-08 20:40:16 -07:00
cong-or
747431044d feat(attention): extract KV-cache update from FlexAttention backend (#36263)
Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
2026-03-08 20:40:12 -07:00
Cyrus Leung
d62856b928 [Misc] Move processors to transformers_utils (#35953)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-09 11:31:39 +08:00
Alex Brooks
bd2659a566 Increase Flexibility for OOV Multimodal Token Handling (#34858)
Signed-off-by: Alex Brooks <albrooks@redhat.com>
2026-03-08 20:30:49 -07:00
Shaun Kotek
90512b2e8b fix: Use iterator as not to store all the file loads in memory at once (#36149)
Signed-off-by: Shaun Kotek - Nvidia <skotek@nvidia.com>
2026-03-08 20:25:21 -07:00
wang.yuqi
dcf8862fd4 [Examples][1/n] Resettle basic examples. (#35579)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-08 20:22:53 -07:00
Weiguang Li
43aa389231 [Bugfix] Fix CPU OMP autobind assertion to use local_world_size (#35815)
Signed-off-by: liweiguang <codingpunk@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2026-03-08 20:07:29 -07:00
Wentao Ye
384425f84e [Dependency] Remove default ray dependency (#36170)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-08 20:06:22 -07:00
Harry Mellor
a0f44bb616 Allow markdownlint to run locally (#36398)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-08 20:05:24 -07:00
Kunshang Ji
fde4771bbd [XPU][Doc] update xpu document about triton dependency/conflict issue. (#36301)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
2026-03-09 02:09:22 +00:00
Jiangyun Zhu
e5ff140216 [cudagraph] fix cudagraph warning in deepseekv32 (#28044)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2026-03-08 20:27:41 -04:00
danisereb
0a6a3a1290 Add support for ModelOpt MXFP8 MoE models (#35986)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-03-08 13:00:05 -07:00
Sage
4497431df6 [Frontend] Add GPU-less render serving path (vllm launch render) (#36166) 2026-03-08 16:35:09 +01:00
nvnbagrov
b7332b058c [Model] Nano Nemotron VL - fast media preprocessing (#35657)
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
2026-03-08 03:04:05 -07:00
Andreas Karatzas
40077ea3de [CI] fix flaky empty responses and add diagnostic assertions in vision chat tests (#36341)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-08 14:42:24 +08:00
Samuel Shen
5d6aae4577 [LMCache MP Patch]: Race Condition + Duplicated Block Ids (#35831) 2026-03-07 13:52:48 -08:00
Roy Huang
63298ee173 [Bugfix][LMCache][KVConnector] fix potential memory leak in LMCache multiprocess mode (#35931) 2026-03-07 13:52:35 -08:00
Richard Zou
2dde535df1 [compile] Split compile/warmup monitoring (#36098) 2026-03-07 13:52:11 -08:00
Wei Zhao
379689d533 [Perf] Support FP8 KV cache for Flashinfer MLA Sparse (#35891) 2026-03-07 13:51:54 -08:00
PatchyTIS
a6be75dbd2 [Core] NGram GPU Implementation compatible with Async Scheduler (#29184) 2026-03-07 13:51:37 -08:00
Micah Williamson
ee54f9cdb9 [ROCm][CI] Accept Different But Valid Output for test_olmoe_tp (#35224) 2026-03-07 13:50:52 -08:00
Micah Williamson
fc4657756f [ROCm][CI] Enable AITER for failing test_gpt_oss test case on MI355 (#36174) 2026-03-07 13:50:17 -08:00
qli88
eebd14651f [CI] Enable Crosslayer KV layout tests for ROCm platforms (#35416) 2026-03-07 13:49:56 -08:00
Matthew Bonanni
ebb9cc5f2b [UX][Startup] Account for CUDA graphs during memory profiling (#30515) 2026-03-07 13:49:23 -08:00
rahul-sarvam
85f50eb41f Adding support to Sarvam's MoE models (#33942)
Signed-off-by: rahul-sarvam <140298821+rahul-sarvam@users.noreply.github.com>
2026-03-08 01:16:24 +08:00
Taneem Ibrahim
5261223c2d [Misc] Remove duplicate parser registration (#36303)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
2026-03-07 09:37:01 -05:00
lif
00b814ba5a [V0 Deprecation] Remove unused swap_space parameter (#36216)
Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: mcelrath
2026-03-07 22:09:55 +08:00
vllmellm
ee8a29511f [Bugfix] Fix compressed-tensors quantization failure for DeepSeek-R1 on MI300x (#36247)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2026-03-07 09:26:59 +00:00
milesial
755356b3d1 feat: expose media_io_kwargs at runtime (#34778)
Signed-off-by: Alexandre Milesi <milesial@users.noreply.github.com>
2026-03-07 04:27:04 +00:00
Andreas Karatzas
58928475e4 [ROCm][CI] Making entrypoints more deterministic on ROCm (#36293) 2026-03-06 19:04:40 -08:00
Mengtao (Martin) Yuan
1a9718085c Fix CUDA graph decode capture crash in AITER FlashAttention (#36042)
Signed-off-by: Martin Yuan <myuan@meta.com>
Co-authored-by: Martin Yuan <myuan@meta.com>
2026-03-06 18:12:07 -08:00
Kunshang Ji
7eb524e64c refine vllm bench throughput --backend hf (#35971)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-07 02:10:33 +00:00
Nick Hill
c7f32e08c2 [BugFix] Avoid ignored trust_remote_code warnings (#36290)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-07 01:24:18 +00:00
Nick Hill
b354686524 [Model Runner V2] Fix warmup for pipeline parallel (#36280)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-06 16:58:51 -08:00
Nick Hill
6a18d8789b [Core] Fix benign error log during normal shutdown (#36270)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2026-03-07 00:39:21 +00:00
Itay Alroy
24a03915f5 mla: don't update kv cache on dummy forwards (#36282)
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
2026-03-07 00:36:00 +00:00
Andreas Karatzas
b5e34e1fca [ROCm][CI] Fixing yaml file for external amd-ci signal (#36284)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 18:30:39 -06:00
Copilot
ce8546a12b [docs][torch.compile] Add fusions.md — kernel/operator fusion reference page (#35538)
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: ProExpertProg <luka.govedic@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-03-06 23:55:06 +00:00
Chuan (Richard) Li
c188749bcd [ROCm] Support MLA with nhead<16 and FP8 KV cache for TP=8 (Kimi K2.5/Linear) (#35850)
Signed-off-by: Li <chuali@amd.com>
2026-03-06 20:24:03 +00:00
Alexei-V-Ivanov-AMD
225d1090a0 Enabling some B200-specific tests on MI355 (#35253)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
Signed-off-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
2026-03-06 19:27:20 +00:00
eellison
f3c6c9c9d7 [CustomOp] CustomOp FusedRMSNormGated (#35877)
Signed-off-by: Elias Ellison <elias.ellison@gmail.com>
Signed-off-by: eellison <elias.ellison@gmail.com>
2026-03-06 10:53:37 -08:00
Nick Hill
26bd43b52d Revert "[BugFix] Fix engine hanging after KV cache initialization fai… (#36262) 2026-03-06 08:28:09 -08:00
Travis Johnson
6b625a8807 [Bugfix] Quickfix followups to busy loop removal in #28053 (#36068)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-03-06 08:13:05 -08:00
Richard Zou
54756b6109 [compile] Stop unconditionally patching constrain_to_fx_strides (#36152)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-03-06 10:17:27 -05:00
Raphaël Rialland
39f9ea0da4 [Bugfix] Fix cudagraph_mode:FULL dispatch (This does not impact FULL_AND_PIECEWISE (default)) (#36165) 2026-03-06 09:15:31 -05:00
Isotr0py
e4ae148a78 [Refactor] Modular video loader backend refactoring (#35202)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-06 06:06:59 -08:00
Isotr0py
1d0c0d209c [Misc] Lazy import registered processors (#36024)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-03-06 06:06:45 -08:00
Chenguang Zheng
fcb73f306c [bugfix] add api process rank in default multimodal request (#36150)
Signed-off-by: fake0fan <645327136@qq.com>
Signed-off-by: Chenguang ZHENG <645327136@qq.com>
2026-03-06 12:00:09 +00:00
Harry Mellor
e2090bf3af [CI] Fix startup error test (#36230)
A change in engine startup error messages in #35478 caused this test failure.

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-06 11:50:28 +00:00
Andreas Karatzas
2a00d3241f [CI][MM] Gate vision encoder attention mask to MiniCPM only, fixing Aria regression (#36206)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 01:17:08 -08:00
Alex Brooks
10f4db4dbe [Frontend] Add Support for MM Encoder/Decoder Beam Search (Offline) (#36153)
Signed-off-by: Alex Brooks <albrooks@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-06 01:16:56 -08:00
Nicolò Lucchesi
5b3ba94ab4 [Core][KVConnector] Support HMA+NixlConnector (#35758)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-03-06 08:51:21 +01:00
zhanqiuhu
90f3c01fa4 [Spec Decode][KV Connector] Fix KV transfer in PD + speculative decoding (#35158)
Signed-off-by: Claude <noreply@anthropic.com>
Signed-off-by: Zhanqiu Hu <zh338@cornell.edu>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2026-03-06 08:50:44 +01:00
Andreas Karatzas
807d680337 [ROCm][CI] Fix tool use test stability - disable skinny GEMM, prefix caching, eliminate batch variance (#35553)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 15:15:12 +08:00
Tyler Michael Smith
5afb387bd4 Change "following fields were present in the request but ignored" log from warn to debug (#36173)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2026-03-05 22:15:46 -08:00
Walter Beller-Morales
43e77e59ab [BugFix] avoid infinite loop with VLLM_PORT and get_open_ports_list (#36191)
Signed-off-by: walterbm <walter.beller.morales@gmail.com>
2026-03-05 22:15:29 -08:00
Russell Bryant
00bd08edee [Security] Respect user trust_remote_code setting in NemotronVL and KimiK25 (#36192)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-05 22:15:19 -08:00
Ajay Anubolu
43f10573c9 [Bugfix] Fix misleading context length error messages (#36197)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 22:15:12 -08:00
Yongye Zhu
86e1060b17 [Bugfix] Fix inner_dp_world initialization order for multi-node TP (#35892)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2026-03-05 22:04:44 -08:00
Mark McLoughlin
27066d1b2b [Frontend][Core] Add shutdown timeout - allowing in-flight requests to finish (#34730)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-05 22:04:31 -08:00
cong-or
57c84ff129 perf: add __slots__ to KVCacheBlock (#36164)
Signed-off-by: cong-or <conchubhar.gannon@gmail.com>
2026-03-05 22:04:09 -08:00
Xiang Shi
e68de8adc0 docs: fix wrong cc in int8.md (#36209)
Signed-off-by: Xiang Shi <realkevin@tutanota.com>
2026-03-06 06:01:02 +00:00
Andreas Karatzas
a1ffa56a1e [CI] Fix bge-m3 similarity reference values after *Defination* typo fix (#36208)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 05:07:29 +00:00
Shiyan Deng
0a208d1f54 [BugFix] Fix engine hanging after KV cache initialization failure (#35478)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-03-05 20:58:09 -08:00
Shiyan Deng
03a49bb8f0 [Feature] Add --distributed-timeout-seconds CLI option (#36047)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-03-05 20:57:51 -08:00
Shiyan Deng
8e87cc57f1 [Bug] Fix a corner case in _process_simple_streaming_events (#34754)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-03-05 20:57:32 -08:00
Cyrus Leung
6dd302653f [Misc] Rename group_mm_kwargs_by_modality -> group_and_batch_mm_kwargs (#36158)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-06 12:32:48 +08:00
Cyrus Leung
de00ebeac4 [Bugfix] Fix simple Mistral-Small example (#36156)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-05 20:25:11 -08:00
Andreas Karatzas
639680d220 [ROCm][CI] Adding missing dependencies for Multi-modal models tests (#36177)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 12:23:10 +08:00
Rohan Potdar
c5362c739f Reenable features for ROCm attention backends (#36185)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-03-05 20:21:06 -08:00
Nikhil Gupta
0a49676fb0 cpu: aarch64: Upgrade OneDNN for aarch64 to add support for int8 matmul (#36147)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
2026-03-06 03:48:59 +00:00
Jeffrey Wang
c012a8c477 Don't fire ray compatibility webhook when PR or branch is not provided (#36088)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
2026-03-06 00:42:21 +00:00
Dor Huri
ebed80a7c8 [Performance] Extract KV-cache update from TreeAttention backend (#35384)
Signed-off-by: dorhuri123 <dor.huri1@live.biu.ac.il>
2026-03-06 00:22:43 +00:00
Nick Hill
a73af584fe [Model Runner V2] Fix warmup for very small kvcache and/or blocksizes (#36176)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-05 14:48:10 -08:00
Zhengxu Chen
a97954b6a8 [compile] Consistent compiler config for saved/loaded vllm backends. (#35810)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-03-05 15:08:12 -05:00
Yanhong Li
a911f4dd20 [Model] Add support for OLMo Hybrid (#32550) 2026-03-05 14:51:06 -05:00
Russell Bryant
5395471d29 [CI] Add explicit permissions to macOS smoke test workflow (#35775)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-05 19:08:48 +00:00
Frank Wang
a57c877f18 [BugFix] Fallback from FA4->FA2 for Batch Invariance (#36059)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
2026-03-05 14:05:56 -05:00
Xin Yang
f917020983 [Perf] Optimize FusedMoEModularKernel output tensor using torch.empty (#35794)
Signed-off-by: Xin Yang <xyangx@amazon.com>
2026-03-05 13:47:53 -05:00
tomeras91
86483ca774 [Bugfix] Disable FlashInfer TRTLLM BF16 path for non-gated MoE (#36146)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
2026-03-05 09:49:05 -08:00
Netanel Haber
b93a9e6f6d ParakeetProjection.norm = RMSNorm instead of nn.LayerNorm (#36133)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-03-05 17:29:30 +00:00
Xinyu Chen
d8839ef7d9 [XPU] Enable ModelRunnerV2 on XPU (#36078)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
2026-03-05 17:19:18 +00:00
Avery Miao
e998fa76b9 [BUGFIX]Fix Qwen-Omni models audio max_token_per_item estimation error leading to encoder_cache_size is 0 (#35994)
Signed-off-by: Miao, Avery <avery.miao@intel.com>
2026-03-05 09:16:29 -08:00
Jiayi Yan
6a895197fa [Bugfix][CI] fix typos (#34934)
Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-05 17:05:46 +00:00
Sage Moore
8c760b6ab6 [ROCm] Refactor ROCm attention backend selection logic (#35246)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2026-03-05 10:51:26 -06:00
AllenDou
3ee68590c7 refactor funasr model. (#36108)
Signed-off-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-05 08:07:37 -08:00
Cyrus Leung
7196348157 [Bugfix] Fix Qwen-VL tokenizer implementation (#36140)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-05 08:07:19 -08:00
Ning Xie
176c799f4c [openai api] log exception in exception handler (1/N) (#31164)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2026-03-05 16:00:12 +00:00
Or Ozeri
612e7729c2 [KVConnector] Scheduler: Fix num_computed_tokens after async KV load (#34616)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2026-03-05 14:25:15 +00:00
Harry Mellor
ecde7af9c4 Fix import that was moved in Transformers 5.2.0 (#36120)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-05 13:59:44 +00:00
Harry Mellor
8df523351f [Docs] Only build docs if documentation or ready labels are present (#36135)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-05 13:58:16 +00:00
Andreas Karatzas
b03ff6a96b [CI] Stabilize test_no_args_tool_call and add ROCm-specific server args (#36107)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-05 21:52:49 +08:00
Ajay Anubolu
ed81d5edd1 [Bugfix] Fix RunAI streamer crash with S3-hosted model paths (#35976)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 12:14:20 +00:00
Shiyan Deng
3c23ac840e [Bugfix] Fix mypy errors in hermes_tool_parser.py (#36114)
Signed-off-by: Shiyan Deng <dsy842974287@meta.com>
2026-03-05 11:37:47 +00:00
cjackal
a708ef5944 [Misc] Fix SyntaxWarning - invalid escape sequence '\e' (#36020)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2026-03-05 10:55:31 +00:00
Kunshang Ji
66a2209645 [Hardware] Replace torch.cuda.synchronize() api with torch.accelerator.synchronize (#36085)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-05 10:36:39 +00:00
Doug Smith
0bfa229bf1 [Release] Include source distribution (sdist) in PyPI uploads (#35136)
Signed-off-by: dougbtv <dosmith@redhat.com>
Co-authored-by: Daniele Trifirò <dtrifiro@redhat.com>
2026-03-05 01:43:50 -08:00
Paco Xu
7493c51c55 [Docs] add Dynamo/aibrix integration and kubeai/aks link (#32767)
Signed-off-by: Paco Xu <paco.xu@daocloud.io>
2026-03-05 17:39:50 +08:00
Reagan Lee
ac773bbe80 [Docs] Update docs to include mm processor + encoder benchmarks (#34083)
Signed-off-by: Reagan <reaganjlee@gmail.com>
2026-03-05 01:38:25 -08:00
Christian Munley
48e376a007 qwen3coder tool parser fix anyOf double encoded parameters (#36032)
Signed-off-by: Christian Munley <cmunley@nvidia.com>
2026-03-05 09:06:57 +00:00
Isotr0py
21eb2c3372 [Chore] Correct MTP models test registry ordering (#36115)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-05 08:55:04 +00:00
Seiji Eicher
e2b31243c0 [Docs] Update CacheConfig block_size docstring to remove inaccurate limit when using CUDA (#35632)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2026-03-05 06:24:08 +00:00
Martin Hickey
c3598d02fa [Misc] Remove deprecated items that are due for removal (#36006)
Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
2026-03-05 06:14:50 +00:00
Benjamin Chislett
57c629e9c1 [Bugfix] Fix block_size for hybrid model MTP (#36036)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2026-03-05 06:10:54 +00:00
zihaoanllm
d106bf39f5 [Doc] Add Parallel Draft Models (#35973)
Signed-off-by: <zihaoan2@amd.com>
Signed-off-by: zihaoanllm <zihaoan2@amd.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-05 05:44:07 +00:00
Yanan Cao
b0651021e5 [Kernel] [Helion] [11/N] Retune configs for silu_mul_fp8 (#36062) 2026-03-04 21:25:59 -08:00
Hanjun Cho
f600d5192e [Bugfix] Fix score layer quantization for sequence classification models - Qwen3 (VL) Reranker (#35849)
Signed-off-by: Hanjun Cho <gkswns0531@gmail.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-03-04 20:57:20 -08:00
Tianmu Li
8e7820131e [Perf] Use dummy M for weight prepacking on x86 (#35890)
Signed-off-by: Li, Tianmu <tianmu.li@intel.com>
2026-03-05 04:56:49 +00:00
Andrii Skliar
0a12cea25f Order config.py in Lexicographical order (#35866)
Signed-off-by: Andrii Skliar <askliar@nvidia.com>
Co-authored-by: Andrii Skliar <askliar@nvidia.com>
2026-03-04 20:56:47 -08:00
Zhengxu Chen
dd6dbd93f8 [compile] Fix extra cache save on warm start. (#35921)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-03-05 12:56:30 +08:00
Harry Mellor
26366009c5 [CI] Don't leave docs preview comment on closed PRs (#36087)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-05 04:51:46 +00:00
Nick Hill
16c472abe7 [Core] Move ray-specific WorkerWrapperBase methods to RayWorkerWrapper (#35328)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-05 12:11:59 +08:00
daje0601
3b23d57c96 [Model] Add LoRA support for Whisper models (#29856)
Signed-off-by: daje0601 <englishmt4118@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-05 10:38:25 +08:00
Wentao Ye
2f4226fe52 [CI] Fix pre-commit mypy issue in main (#36049) 2026-03-04 18:13:12 -08:00
nkm-meta
792cbd64ca Add platform method to enable custom collective ops registration (#34760)
Signed-off-by: Naina Kuruballi Mahesh <nainakm@meta.com>
2026-03-05 00:50:32 +00:00
Zhengxu Chen
2ed4722e26 [compile] Reduce log spam from compile. (#36044)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
2026-03-05 00:48:36 +00:00
Nick Hill
a3299c3d1d [Model Runner V2] Misc code simplification (#35941)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-04 15:26:35 -08:00
Andreas Karatzas
6c21a0c2d7 [ROCm][CI] Added MI325 mirrors (stage C) (#35239)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-04 14:48:46 -08:00
Shanshan Shen
562339abc3 [Misc] Support OOT linear method registering (#35981)
Signed-off-by: shen-shanshan <467638484@qq.com>
2026-03-04 22:25:56 +00:00
amitz-nv
d7adcadb9b [Bugfix] Fix passing of activation_type to trtllm fused MoE NVFP4 and FP8 (#36017)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
2026-03-04 22:23:51 +00:00
Simon Mo
f678c3f61a [RL] [Weight Sync] Guard IPC update-info pickle deserialization behind insecure serialization flag (#35928)
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
2026-03-04 17:05:32 -05:00
Thomas Parnell
be0a3f7570 [Bugfix] Fix race in non-blocking num_accepted_tokens GPU->CPU copy (#36013)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 13:52:44 -08:00
Harry Mellor
17dc9c7fc9 [CI] Bump mypy version (#34950)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 20:55:11 +00:00
fenypatel99
7eca859110 Add PyTorch profiler schedule support with warmup/active iterations (#35240) 2026-03-04 12:53:38 -08:00
Russell Bryant
636ee223ac [Docs] Document security risks of GPT-OSS Python tool (#35139)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-04 20:27:31 +00:00
Robert Shaw
b7d59ffce2 [UX] Remove NoOpOffloader log (#35678)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-03-04 12:13:40 -08:00
Richard Zou
5569f5218d [torch.compile] Stop lazily compiling (#35472)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-03-04 12:13:17 -08:00
Davina Zaman
138d891d7f [Docs] Clarify structured outputs configuration for Qwen3 reasoning mode (#32441)
Signed-off-by: Davina Zaman <davzaman@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 11:44:39 -08:00
Stefano Castagnetta
d7166e74c1 [CI] Add Blackwell AsyncTP correctness test (#35871)
Signed-off-by: Stefano Castagnetta <scastagnetta@nvidia.com>
2026-03-04 19:41:21 +00:00
Nick Hill
417fd28fb1 [Model Runner V2] Fix pooling (#36019)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-03-04 10:53:17 -08:00
tomeras91
7faba503c4 [Kernel][Mamba] Optimize Mamba2 SSD prefill Triton kernels (#35397)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
2026-03-04 19:47:17 +01:00
Hyunkyun Moon
bc6be89d16 [Frontend] Add vllm launch command for GPU-less preprocessing serving (#34551)
Signed-off-by: HyunKyun Moon <mhg5303@gmail.com>
2026-03-04 18:41:52 +00:00
Maxime Grenu
32224f568a docs: update CPU Docker images to reference Docker Hub instead of AWS ECR (#34882)
Signed-off-by: Maxime Grenu <69890511+cluster2600@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 10:31:35 -08:00
Abhishek Mathukiya
f3dc292e9f docs: add version requirement note for --profiler-config flag (#32454)
Signed-off-by: abhishkh <mathukiya.a@northeastern.edu>
2026-03-04 18:13:54 +00:00
Chen
138c5fa186 [Docs] Add RunPod GPU deployment guide for vLLM (#34531)
Signed-off-by: lisperz <zhuchen200245@163.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 10:11:34 -08:00
Russell Bryant
2f2c1d73a7 [Docs] Upgrade dynamic LoRA warning to admonition block (#35218)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2026-03-04 10:01:42 -08:00
Bhuminjay Soni
fb3e78ab09 [Feature][CI]: compare func & no_func outputs in test_functionalization.py (#35481)
Signed-off-by: Bhuminjay <bhuminjaysoni@gmail.com>
Signed-off-by: Bhuminjay Soni <Soni5Happy@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-03-04 18:01:16 +00:00
Michael Yao
fd3bfe74c9 [Docs] Update design/multiprocessing.md (#30677)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2026-03-04 17:58:59 +00:00
tc-mb
bfdb512f11 fix minicpmo4.5: fix attn_mask in vit attn && fix resampler pos_emb i… (#34127)
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: hezhihui <hezhihui@modelbest.cn>
2026-03-04 17:46:17 +00:00
Sage
d25c1ec3c9 docs(cpu): Clarify pre-built wheels requirement for CPU Python-only build (#35090)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-04 17:45:35 +00:00
Xing Liu
7cc6058ac6 [Doc] Add MTP docs and update speculative decoding guidance (#35197)
Signed-off-by: liuxing <945764858@qq.com>
2026-03-04 17:23:34 +00:00
Manrique Vargas
28028dff2f fix(docs): use static rdzv backend in multi-node troubleshooting script (#34784)
Signed-off-by: machov <mv1742@nyu.edu>
2026-03-04 17:15:35 +00:00
Dr Alex Mitre
3417ba5648 docs: add README for logits_processor examples (#35933) 2026-03-04 17:09:19 +00:00
Yan Ma
58cfe0dc44 Fix phi4-mm and remove cuda binding (#35964)
Signed-off-by: Yan Ma <yan.ma@intel.com>
2026-03-05 01:08:05 +08:00
simone-dotolo
e86221deb6 [Doc] Fix GPU Worker count in Process Count Summary (#36000)
Signed-off-by: simone-dotolo <simonedotolo@libero.it>
Signed-off-by: simone-dotolo <84937474+simone-dotolo@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-04 17:03:14 +00:00
Netanel Haber
289fc48ab7 Use MMEncoderAttention (=use FlashAttention) instead of torch.sdpa in radio.py (#35653) 2026-03-04 08:43:13 -08:00
Christian Pinto
2f2212e6cc Split generic IO Processor plugins tests from Terratorch specific ones (#35756)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
2026-03-05 00:01:03 +08:00
Nicolò Lucchesi
18e01a0a10 [Misc] Add --attention-backend auto option (#35738)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-03-04 15:12:27 +00:00
sungsoo ha
6cb901093f [Core] Add All-to-All communication backend for DCP (#34883)
Signed-off-by: Sungsoo Ha <sungsooh@nvidia.com>
Signed-off-by: sungsoo ha <hasungsoo@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 10:01:57 -05:00
Cyrus Leung
ead7bde1ab [Bugfix] Make kaldi_native_fbank optional (#35996)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-04 06:47:32 -08:00
Qi Wang
6aa6ad8992 [BugFix] Fix implicit and incorrect assumption on ECConnector is_producer (#34783)
Signed-off-by: Qi Wang <qiwa@nvidia.com>
2026-03-04 15:01:30 +01:00
Raghavan
c8c3935b70 [Bugfix][Model] Fix FP8 k_scale/v_scale not loaded for Qwen3-MoE (#35656)
Signed-off-by: raghavan <oneraghavan@gmail.com>
2026-03-04 13:15:38 +00:00
Ronen Schaffer
bb6888b8b1 [Bugfix][CPUOffloadingManager] Prevent eviction of already-stored blocks in LRU/ARC prepare_store() (#35846)
Signed-off-by: Ronen Schaffer <ronen.schaffer@ibm.com>
2026-03-04 14:25:33 +02:00
Taneem Ibrahim
1aaec59d79 [MISC] fixed tool_parser mypy errors (#35640)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 12:23:12 +00:00
pougetat
1659b2e058 [Feature] Add basic metrics for /realtime endpoint (#35500)
Signed-off-by: Thomas Pouget-Abadie <thomaspou@microsoft.com>
Signed-off-by: pougetat <thomas.pougetabadie@gmail.com>
Co-authored-by: Thomas Pouget-Abadie <thomaspou@microsoft.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-04 19:56:32 +08:00
haosdent
d6e04f4c43 [Bugfix] Cap FULL decode cudagraph sizes for Mamba/hybrid models (#34094) (#34571)
Signed-off-by: haosdent <haosdent@gmail.com>
Co-authored-by: zjy0516 <riverclouds.zhu@qq.com>
2026-03-04 11:56:22 +01:00
Kunshang Ji
a8f66cbde8 [XPU] bump vllm-xpu-kernels to v0.1.3 (#35984)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2026-03-04 18:23:31 +08:00
Kunshang Ji
16d2ad1d38 [Hardware] Replace torch.cuda.empty_cache with torch.accelerator.empty_cache (#30681)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-03-04 09:49:47 +00:00
Chuan (Richard) Li
5dc3538736 [ROCm][Bugfix] Fall back from CK MXFP4 MoE when GEMM dimensions are unsupported (#35893)
Signed-off-by: Li <chuali@amd.com>
2026-03-04 08:30:54 +00:00
Nathan Price
36bf213181 [Bugfix] Add missing dynamic_arg_dims for Qwen3-ASR torch.compile (#35869)
Signed-off-by: Nathan Price <nathan@abridge.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-04 08:29:01 +00:00
Joe Runde
6f0dd93801 [Core] Remove busy loop from idle buffer readers (#28053)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-03-04 07:44:20 +00:00
Andrii Skliar
5d199ac8f2 Support Audio Extraction from MP4 Video for Nemotron Nano VL (#35539)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
Signed-off-by: Andrii Skliar <askliar@nvidia.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Andrii <askliar@nvidia.com>
Co-authored-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
Co-authored-by: Andrii Skliar <askliar@oci-nrt-cs-001-vscode-01.cm.cluster>
Co-authored-by: Andrii <askliar@nvidia.com>
Co-authored-by: root <root@pool0-03748.cm.cluster>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: root <root@pool0-02416.cm.cluster>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: root <root@pool0-04880.cm.cluster>
2026-03-03 23:20:33 -08:00
Komal Kumar Teru
9e0f44bec4 [cohere][fix][spec-decode]: fix crash when allowed_token_ids is set without penalties (#35654)
Signed-off-by: kkt-cohere <komal@cohere.com>
2026-03-03 23:20:15 -08:00
724 changed files with 42064 additions and 28583 deletions

View File

@@ -13,9 +13,10 @@ import os
from contextlib import contextmanager
import lm_eval
import numpy as np
import yaml
from vllm.platforms import current_platform
DEFAULT_RTOL = 0.08
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
"allow_deprecated_quantization=True,"
)
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
model_args += "attention_backend=TRITON_ATTN"
env_vars = eval_config.get("env_vars", None)
with scoped_env_vars(env_vars):
results = lm_eval.simple_evaluate(
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
)
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
min_acceptable = ground_truth * (1 - rtol)
success = success and measured_value >= min_acceptable
assert success
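The hunk above swaps the symmetric np.isclose comparison for a one-sided lower bound: a measured score may now exceed the ground truth by any margin and still pass, and only scores below ground_truth * (1 - rtol) fail. A minimal sketch of the difference, reusing the file's DEFAULT_RTOL with made-up accuracy numbers purely for illustration:

import numpy as np

DEFAULT_RTOL = 0.08      # same default tolerance as in the test file above
ground_truth = 0.80      # hypothetical reference score
measured_value = 0.91    # hypothetical measured score, well above the reference

# Old check: symmetric relative tolerance, so landing far *above* the
# reference also counted as a failure.
old_ok = np.isclose(ground_truth, measured_value, rtol=DEFAULT_RTOL)   # False

# New check: only a lower bound on the measured score.
new_ok = measured_value >= ground_truth * (1 - DEFAULT_RTOL)           # True

print(old_ok, new_ok)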

View File

@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},

View File

@@ -10,7 +10,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -37,7 +36,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -64,7 +62,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
@@ -91,7 +88,6 @@
"server_parameters": {
"model": "deepseek-ai/DeepSeek-R1",
"tensor_parallel_size": 8,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,

View File

@@ -5,7 +5,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -23,7 +22,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -41,7 +39,6 @@
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy"
},
@@ -59,7 +56,6 @@
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,

View File

@@ -166,12 +166,19 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
EOF
fi
# Notify Slack if webhook is configured.
# Notify Slack if webhook is configured and PR/branch are valid.
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
PR="${BUILDKITE_PULL_REQUEST:-}"
BRANCH="${BUILDKITE_BRANCH:-}"
# Skip notification if PR is invalid or branch is empty
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
else
echo ">>> Sending Slack notification"
# Single quotes are intentional: the f-string expressions are Python, not shell.
# shellcheck disable=SC2016
PAYLOAD=$(python3 -c '
import json, os, sys
pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
branch = os.getenv("BUILDKITE_BRANCH", "unknown")
@@ -194,10 +201,11 @@ data = {
print(json.dumps(data))
')
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
-H 'Content-type: application/json' \
-d "$PAYLOAD")
echo " Slack webhook response: $HTTP_CODE"
fi
else
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
fi

View File

@@ -34,7 +34,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "

View File

@@ -27,7 +27,7 @@ function cpu_tests() {
podman exec -it "$container_id" bash -c "
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "

View File

@@ -25,5 +25,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
-e PT_HPU_LAZY_MODE=1 \
"${image_name}" \
/bin/bash -c '
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
'
EXITCODE=$?

View File

@@ -34,17 +34,17 @@ docker run \
set -e
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py

View File

@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
PLATFORM_ARGS=("--no-async-scheduling")
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
else
# Non-ROCm platform (CUDA/other)

View File

@@ -72,7 +72,7 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indicies for all existing wheels
# call script to generate indices for all existing wheels
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/

View File

@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
# generate source distribution using setup.py
python setup.py sdist --dist-dir=$DIST_DIR
ls -la $DIST_DIR
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
echo "Found sdist: $SDIST_FILE"
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
echo "Wheels uploaded to PyPI"
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
echo "Wheels and source distribution uploaded to PyPI"

View File

@@ -467,7 +467,7 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attetion (MI300)" test group
# TODO: Add the "V1 Test attention (MI300)" test group
- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental, amdproduction]
@@ -499,17 +499,6 @@ steps:
- pytest -v -s v1/determinism/test_batch_invariance.py
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
@@ -540,12 +529,12 @@ steps:
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
@@ -1180,52 +1169,45 @@ steps:
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py
- python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@@ -1258,16 +1240,6 @@ steps:
commands:
- pytest -s -v tests/quantization/test_blackwell_moe.py
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -1514,6 +1486,20 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
@@ -1653,8 +1639,8 @@ steps:
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt
##### EPLB Accuracy Tests #####
- label: DeepSeek V2-Lite Accuracy
@@ -1681,16 +1667,6 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental, amdproduction]
@@ -2174,20 +2150,7 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attetion (MI300)" test group
- label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/config/attention.py
- vllm/model_executor/layers/attention
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
# TODO: Add the "V1 Test attention (MI300)" test group
- label: Batch Invariance Tests (H100) # 10min
mirror_hardwares: [amdexperimental]
@@ -2205,6 +2168,8 @@ steps:
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
- label: V1 Test attention (B200) # 10min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
gpu: b200
source_file_dependencies:
@@ -2243,12 +2208,12 @@ steps:
commands:
- pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
@@ -2824,12 +2789,14 @@ steps:
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py
- python3 examples/basic/offline_inference/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 21 min
- label: Blackwell Test (MI355) # 21 min
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_1
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
@@ -2848,28 +2815,28 @@ steps:
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
- rocm-smi
- python3 examples/basic/offline_inference/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- pytest -v -s tests/kernels/attention/test_attention_selector.py
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
## Quantization
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
@@ -2939,13 +2906,15 @@ steps:
- label: Blackwell LM Eval Small Models
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
gpu: b200
optional: true # run on nightlies
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
##### 1 GPU test #####
##### multi gpus test #####
@@ -3181,6 +3150,20 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_4
# grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
@@ -3313,8 +3296,8 @@ steps:
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt
##### EPLB Accuracy Tests #####
- label: DeepSeek V2-Lite Accuracy
@@ -3328,18 +3311,9 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi355_4
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355)
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
agent_pool: mi355_2
timeout_in_minutes: 60
gpu: b200
optional: true
@@ -3358,3 +3332,18 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
- label: Attention Benchmarks Smoke Test (B200-MI355)
device: b200
mirror_hardwares: [amdexperimental, amdmi355]
agent_pool: mi355_2
num_gpus: 2
optional: true
working_dir: "/vllm-workspace/"
timeout_in_minutes: 10
source_file_dependencies:
- benchmarks/attention_benchmarks/
- vllm/v1/attention/
commands:
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1

View File

@@ -36,6 +36,16 @@ steps:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: AsyncTP Correctness Tests (B200)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"

View File

@@ -67,6 +67,7 @@ steps:
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
@@ -95,6 +96,8 @@ steps:
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
@@ -210,6 +213,19 @@ steps:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
timeout_in_minutes: 30
device: a100
working_dir: "/vllm-workspace/tests"
num_devices: 2
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
- label: Pipeline + Context Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"

View File

@@ -41,6 +41,11 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
- pytest -v -s entrypoints/test_chat_utils.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
@@ -55,6 +60,11 @@ steps:
- pytest -v -s entrypoints/instrumentator
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
@@ -87,6 +97,11 @@ steps:
- tests/v1
commands:
- pytest -v -s v1/entrypoints
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: OpenAI API Correctness
timeout_in_minutes: 30

View File

@@ -8,8 +8,9 @@ steps:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
@@ -96,7 +97,7 @@ steps:
- vllm/platforms/cuda.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
- python3 examples/basic/offline_inference/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_attention_selector.py

View File

@@ -67,12 +67,13 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/chat.py # for basic
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
# for basic
- python3 basic/offline_inference/chat.py
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 basic/offline_inference/classify.py
- python3 basic/offline_inference/embed.py
- python3 basic/offline_inference/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
@@ -87,6 +88,11 @@ steps:
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Metrics, Tracing (2 GPUs)
timeout_in_minutes: 20

View File

@@ -65,7 +65,7 @@ steps:
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/basic/offline_inference/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@@ -12,6 +12,11 @@ steps:
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Processor Test (CPU)
depends_on:
@@ -54,6 +59,11 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
- label: Multi-Modal Models (Extended) 2
optional: true

View File

@@ -15,9 +15,12 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
# begin io_processor plugins test
# test generic io_processor plugins functions
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
# test Terratorch io_processor plugins
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# test bge_m3_sparse io_processor plugin
- pip install -e ./plugins/bge_m3_sparse_plugin
@@ -36,3 +39,8 @@ steps:
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
mirror:
amd:
device: mi325_2
depends_on:
- image-build-amd

.github/mergify.yml
View File

@@ -3,6 +3,7 @@ pull_request_rules:
description: Automatically apply documentation label
conditions:
- label != stale
- -closed
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -37,15 +38,13 @@ pull_request_rules:
> [!TIP]
> <details>
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
> <summary>Is <code>mypy</code> failing?</summary>
> <br/>
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
>
> ```bash
> # For mypy (substitute "3.10" with the failing version if needed)
> pre-commit run --hook-stage manual mypy-3.10
> # For markdownlint
> pre-commit run --hook-stage manual markdownlint
> ```
> </details>

View File

@@ -6,6 +6,9 @@ on:
- main
workflow_dispatch: # Manual trigger
permissions:
contents: read
jobs:
macos-m1-smoke-test:
runs-on: macos-latest

View File

@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: v1.38.1
rev: v1.43.5
hooks:
- id: typos
args: [--force-exclude]
@@ -24,12 +24,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
- repo: https://github.com/DavidAnson/markdownlint-cli2
rev: v0.21.0
hooks:
- id: markdownlint
exclude: '.*\.inc\.md'
stages: [manual] # Only run in CI
- id: markdownlint-cli2
language_version: lts
args: [--fix]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
@@ -55,7 +55,7 @@ repos:
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -127,6 +127,13 @@ repos:
language: python
types: [python]
additional_dependencies: [regex]
# prevent new uses of torch.cuda APIs
- id: check-torch-cuda-call
name: "Prevent new 'torch.cuda' API calls"
entry: python tools/pre_commit/check_torch_cuda.py
language: python
types: [python]
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/pre_commit/validate_config.py
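The new check-torch-cuda-call hook lines up with the torch.cuda to torch.accelerator substitutions in the benchmark diffs further down. The real rules live in tools/pre_commit/check_torch_cuda.py, which is not part of this comparison; the following is only a hypothetical sketch of what such a checker does:

```python
# Hypothetical sketch only; the actual checker is tools/pre_commit/check_torch_cuda.py,
# whose contents are not shown in this comparison.
import re
import sys

BANNED = re.compile(r"\btorch\.cuda\.\w+\(")

def main(paths: list[str]) -> int:
    status = 0
    for path in paths:
        with open(path, encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                if BANNED.search(line):
                    print(f"{path}:{lineno}: new torch.cuda call; "
                          "prefer a device-agnostic torch.accelerator API")
                    status = 1
    return status

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
```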

View File

@@ -9,6 +9,7 @@ build:
python: "3.12"
jobs:
post_checkout:
- bash docs/maybe_skip_pr_build.sh
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
pre_create_environment:
- pip install uv

View File

@@ -187,7 +187,7 @@ python benchmark.py \
## Hardware Requirements
| Backend | Hardware |
|---------|----------|
| ------- | -------- |
| Flash/Triton/FlashInfer | Any CUDA GPU |
| CUTLASS MLA | Blackwell (SM100+) |
| FlashAttn MLA | Hopper (SM90+) |

View File

@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
# Fallback for unparsable specs
return (0, 0, 0)

View File

@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
@@ -701,7 +700,7 @@ def _run_single_benchmark(
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
times = []
@@ -714,7 +713,7 @@ def _run_single_benchmark(
forward_fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
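This and many of the benchmark diffs that follow make the same substitution: torch.cuda.synchronize() and torch.cuda.empty_cache() become their torch.accelerator counterparts. A minimal sketch of the device-agnostic timing loop the hunks converge on, assuming PyTorch 2.6+ where torch.accelerator.synchronize() and the generic torch.Event are available (the helper name and iteration counts are illustrative):

```python
import torch

def time_fn(fn, warmup: int = 5, iters: int = 20) -> float:
    """Average latency of fn() in milliseconds, without hard-coding torch.cuda."""
    for _ in range(warmup):
        fn()
    # Dispatches to whichever accelerator is current (CUDA, ROCm/HIP, XPU, ...).
    torch.accelerator.synchronize()
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.accelerator.synchronize()
    return start.elapsed_time(end) / iters
```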

View File

@@ -141,7 +141,6 @@ def _create_vllm_config(
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
@@ -391,7 +390,7 @@ def _run_single_benchmark(
attn_metadata,
output=out,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
times = []
@@ -412,7 +411,7 @@ def _run_single_benchmark(
)
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer

View File

@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |

View File

@@ -94,7 +94,7 @@ def create_logits(
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.cuda.synchronize()
torch.accelerator.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
@@ -102,7 +102,7 @@ def reset_memory_stats():
"""Reset peak memory statistics."""
reset_buffer_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
gc.collect()
@@ -123,7 +123,7 @@ def benchmark_function(
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
@@ -140,7 +140,7 @@ def benchmark_function(
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Calculate timing
times = [

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 dimensions
NOPE_DIM = 512
ROPE_DIM = 64
NUM_HEADS = 128
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
def get_configs():
return NUM_TOKENS
def make_inputs(num_tokens, dtype):
"""Create inputs matching the real code path.
Args:
contiguous_nope: If False, simulate the transposed BMM output
(non-contiguous nope with stride pattern from
[N,B,L].transpose(0,1)).
"""
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
ql_nope = raw.transpose(0, 1)
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
return ql_nope, q_pe
# ---- Non-contiguous nope benchmark (real code path) ----
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens"],
x_vals=get_configs(),
line_arg="provider",
line_vals=["torch_cat", "concat_mla_q"],
line_names=["torch.cat", "concat_mla_q (v8)"],
styles=[("blue", "--"), ("green", "-")],
ylabel="Latency (us)",
plot_name="concat_mla_q-transposed",
args={},
)
)
def bench_transposed(num_tokens, provider):
dtype = torch.bfloat16
ql_nope, q_pe = make_inputs(num_tokens, dtype)
q_out = torch.empty(
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch_cat":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
)
else:
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
parser.add_argument(
"--save-path", type=str, default=None, help="Path to save benchmark results"
)
args = parser.parse_args()
print("\n" + "=" * 70)
print("CONCAT MLA Q KERNEL BENCHMARKS")
print("=" * 70)
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
print(
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
)
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
print("=" * 70)
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
bench_transposed.run(print_data=True, save_path=args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import math
import torch
from vllm import _custom_ops as ops
from vllm.triton_utils import triton
# DeepSeek V3 MLA dimensions
NOPE_DIM = 512
ROPE_DIM = 64
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
# Realistic prefill scenarios:
# - 1 long prefill: single request, 16K-96K tokens
# - 4 medium prefills: 4 requests, 4K-24K tokens each
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
SCENARIOS = [
# (label, num_reqs, total_tokens_list)
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
]
def make_inputs(total_tokens, num_reqs, block_size):
"""Create synthetic FP8 cache, block table, and output buffer.
Fills the cache with random bytes (we only measure throughput,
not correctness). Block table maps each request to contiguous
physical blocks.
"""
# Divide tokens evenly across requests
base_len = total_tokens // num_reqs
remainder = total_tokens % num_reqs
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
# workspace_starts: cumulative sum of seq_lens
workspace_starts = [0] * num_reqs
for r in range(1, num_reqs):
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
# Physical blocks needed per request
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
total_blocks = sum(blocks_per_req)
max_blocks = max(blocks_per_req)
# Allocate cache with random data (content doesn't matter for perf)
cache = torch.randint(
0,
256,
(total_blocks, block_size, ENTRY_BYTES),
dtype=torch.uint8,
device="cuda",
)
# Block table: contiguous block assignments
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
block_idx = 0
for r in range(num_reqs):
for b in range(blocks_per_req[r]):
block_table[r, b] = block_idx
block_idx += 1
# Output workspace
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
workspace_starts_t = torch.tensor(
workspace_starts, dtype=torch.int32, device="cuda"
)
return cache, dst, block_table, seq_lens_t, workspace_starts_t
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["total_tokens"],
x_vals=total_tokens_list,
line_arg="provider",
line_vals=["cuda_kernel"],
line_names=["cp_gather_fp8 (CUDA)"],
styles=[("green", "-")],
ylabel="Latency (us)",
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
args={"num_reqs": num_reqs},
)
)
def bench_fn(total_tokens, provider, num_reqs):
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
total_tokens, num_reqs, BLOCK_SIZE
)
quantiles = [0.5, 0.2, 0.8]
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
),
quantiles=quantiles,
rep=500,
)
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
seq_len_per_req = total_tokens_list[0] // num_reqs
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
print(
f"\n--- {label}: {num_reqs} request(s), "
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
)
bench_fn.run(print_data=True, save_path=save_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
)
parser.add_argument(
"--save-path",
type=str,
default=None,
help="Path to save benchmark results as CSV",
)
args = parser.parse_args()
# Print data volume info for bandwidth analysis
read_per_token = ENTRY_BYTES # 656 bytes from cache
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
total_per_token = read_per_token + write_per_token # 1808 bytes
print("\n" + "=" * 70)
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
print("=" * 70)
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
print(f"Per token: {total_per_token} bytes (read + write)")
print(f"Block size: {BLOCK_SIZE} tokens/block")
print("=" * 70)
for label, num_reqs, total_tokens_list in SCENARIOS:
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
print("\n" + "=" * 70)
print("Benchmarking complete!")
print("=" * 70)

View File

@@ -168,7 +168,7 @@ def bench_impl(
# warmup
for kwargs in kwargs_list:
impl_type.get_impl()(**kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
# reference output
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
# test ouptut
# test output
out_q, out_s = output_from_impl(
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
)

View File

@@ -171,7 +171,7 @@ def bench_run(
activation=MoEActivation.SILU,
global_num_experts=num_experts,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
@@ -187,14 +187,14 @@ def bench_run(
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Timing
start_event = torch.Event(enable_timing=True)
@@ -202,7 +202,7 @@ def bench_run(
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -307,7 +307,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -330,7 +330,7 @@ def bench_run(
e=num_experts,
device=device,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -345,7 +345,7 @@ def bench_run(
w2_fp8scale,
a_fp8_scale,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
if not should_use_fn(tensor):
return None
torch.cuda.synchronize()
torch.accelerator.synchronize()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
graph_input = tensor.clone()
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.perf_counter()
for _ in range(num_trials):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()

View File

@@ -385,7 +385,7 @@ def benchmark_operation(
# Warmup before graph capture
for _ in range(warmup):
operation_func(*args, **kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Create CUDA graph
graph = torch.cuda.CUDAGraph()
@@ -398,19 +398,19 @@ def benchmark_operation(
operation_func(*args, **kwargs)
# Graph warmup
torch.cuda.synchronize()
torch.accelerator.synchronize()
for _ in range(warmup):
graph.replay()
# Benchmark with CUDA graph
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.perf_counter()
for _ in range(trials // num_op_per_cudagraph):
# operation_func(*args, **kwargs)
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
avg_time_ms = ((end_time - start_time) / trials) * 1000

View File

@@ -224,7 +224,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
@@ -239,7 +239,7 @@ def bench_run(
topk_weights,
topk_ids,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
@@ -254,7 +254,7 @@ def bench_run(
w2_scale,
a_scale,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
min_run_time = 5
num_warmup = 5

View File

@@ -34,14 +34,14 @@ def main(
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -1035,7 +1035,7 @@ def bench_optype(
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}

View File

@@ -47,13 +47,13 @@ def benchmark_method(
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms

View File

@@ -54,7 +54,7 @@ def clear_triton_cache():
# Clear CUDA memory cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
# Try to clear Triton's runtime cache
try:
@@ -304,19 +304,19 @@ def benchmark_config(
# JIT compilation & warmup
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -324,7 +324,7 @@ def benchmark_config(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()

View File

@@ -131,7 +131,7 @@ def benchmark_config(
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
@@ -149,7 +149,7 @@ def benchmark_config(
quant_config=quant_config,
)
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us

View File

@@ -69,19 +69,19 @@ def benchmark_permute(
# JIT compilation & warmup
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -89,7 +89,7 @@ def benchmark_permute(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
@@ -159,26 +159,26 @@ def benchmark_unpermute(
# JIT compilation & warmup
input = prepare()
run(input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run(input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
end_event.record()

View File

@@ -135,14 +135,14 @@ def benchmark_mrope(
key.clone(),
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
@@ -151,7 +151,7 @@ def benchmark_mrope(
key_clone,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
@@ -159,14 +159,14 @@ def benchmark_mrope(
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics

View File

@@ -103,7 +103,7 @@ def main(
max_logits = torch.empty_like(exp_sums)
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
)
else:
raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -28,7 +28,7 @@ def _time_cuda(
# warmup
for _ in range(warmup_iters):
fn()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
for _ in range(bench_iters):
fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
return start.elapsed_time(end) / bench_iters # ms/iter

View File

@@ -29,7 +29,7 @@ def main(
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:

View File

@@ -84,16 +84,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -104,7 +104,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
return lat

View File

@@ -109,16 +109,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -129,7 +129,7 @@ def run_benchmark(
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
return lat

View File

@@ -251,7 +251,7 @@ def benchmark(
kernel(
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
# Benchmark
latencies: list[float] = []
for _ in range(runs):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
for i in range(iterations_per_run):

View File

@@ -126,7 +126,7 @@ def benchmark_decode(
)
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -136,7 +136,7 @@ def benchmark_decode(
start.record()
fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -138,7 +138,7 @@ def benchmark_prefill(
)
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
times = []
@@ -148,7 +148,7 @@ def benchmark_prefill(
start.record()
fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))

View File

@@ -177,18 +177,18 @@ def benchmark_config(
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
run()
end_event.record()

View File

@@ -35,7 +35,7 @@ def benchmark_shape(
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
torch.accelerator.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
@@ -121,14 +121,14 @@ def benchmark_shape(
# Warmup
for _ in range(warmup):
func()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Timing loop
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.time()
# Calculate timing and TFLOPS

View File

@@ -242,13 +242,24 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
GIT_PROGRESS TRUE
GIT_SHALLOW FALSE
)
else()
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
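Worth a note on the GIT_SHALLOW change: pinning GIT_TAG to a raw commit SHA is presumably why the aarch64 branch sets GIT_SHALLOW FALSE, since a shallow FetchContent clone only guarantees that branch or tag tips are fetchable, not an arbitrary commit.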

View File

@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
int64_t quant_block_size, // quantization block size
const std::string& scale_fmt);
// Concatenate query nope and rope for MLA/DSA attention
void concat_mla_q(
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
// Extract function to gather quantized K cache
void cp_gather_indexer_k_quant_cache(
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]

View File

@@ -8,6 +8,7 @@
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#include "concat_mla_q.cuh"
#ifdef USE_ROCM
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
@@ -995,75 +996,67 @@ namespace vllm {
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ seq_lens, // [BATCH]
const int32_t* __restrict__ workspace_starts, // [BATCH]
const int32_t block_size, const int32_t head_dim,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = workspace_starts[bid];
const int32_t seq_len = seq_lens[bid];
const int32_t tot_slots = seq_len;
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
const int32_t* __restrict__ workspace_starts, // [num_reqs]
const int32_t num_reqs, const int32_t block_size,
const int32_t total_tokens, const int64_t block_table_stride,
const int64_t cache_block_stride, const int64_t cache_entry_stride,
const int64_t dst_entry_stride) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= total_tokens) return;
const int lane_id = threadIdx.x & 31;
const int32_t split_start = split * split_slots;
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
const bool is_active_split = (split_start < tot_slots);
if (!is_active_split) return;
// Adjust the pointer for the block_table for this batch
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = split_start;
int32_t offset_div = offset / block_size;
offset = offset % block_size;
const int32_t* batch_block_table = block_table + batch_offset;
// Adjust dst pointer based on the cumulative sequence lengths
dst += seq_start * dst_entry_stride;
const int tid = threadIdx.x;
// Process each token in this split
for (int pid = split_start; pid < split_end; ++pid) {
auto block_id = batch_block_table[offset_div];
const uint8_t* token_ptr =
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
const uint8_t* no_pe_ptr = token_ptr;
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const __nv_bfloat16* rope_ptr =
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
if (tid < 512) {
// FP8 dequantization
const int tile = tid >> 7; // each tile is 128 elements
const float scale = scales_ptr[tile];
const uint8_t val = no_pe_ptr[tid];
dst_ptr[tid] =
fp8::scaled_convert<__nv_bfloat16, uint8_t,
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
} else if (tid < 576) {
// Rope copy (64 bf16 elements)
const int rope_idx = tid - 512;
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
}
// Move to next token
offset += 1;
if (offset == block_size) {
offset_div += 1;
offset = 0;
}
// Binary search to find which request owns this output token
int lo = 0, hi = num_reqs - 1;
while (lo < hi) {
int mid = (lo + hi + 1) >> 1;
if (workspace_starts[mid] <= flat_warp_id)
lo = mid;
else
hi = mid - 1;
}
const int req_id = lo;
// Compute physical token address via block table
const int out_token_id = flat_warp_id;
const int token_offset = out_token_id - workspace_starts[req_id];
const int cache_block_idx = token_offset / block_size;
const int offset_in_block = token_offset % block_size;
const int physical_block =
block_table[req_id * block_table_stride + cache_block_idx];
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
offset_in_block * cache_entry_stride;
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
const int4 fp8_data = nope_src[lane_id];
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
const float scale = scales_ptr[lane_id >> 3];
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
#ifdef USE_ROCM
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
#else
const bf16_8_t bf16_lo =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
const bf16_8_t bf16_hi =
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
#endif
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
rope_dst[lane_id] = rope_src[lane_id];
}
template <typename scalar_t>
@@ -1257,15 +1250,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
}
// Decide on the number of splits based on the batch size
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(576);
const int total_tokens = dst.size(0);
constexpr int warps_per_block = 8;
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
const int block_size_threads = warps_per_block * 32; // 256 threads
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
stream>>>(
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
static_cast<int32_t>(batch_size), block_size, total_tokens,
block_table_stride, cache_block_stride, cache_entry_stride,
dst_entry_stride);
}
@@ -1365,3 +1359,43 @@ void cp_gather_indexer_k_quant_cache(
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
}
}
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
// rope_dim]
) {
const int num_tokens = ql_nope.size(0);
const int num_heads = ql_nope.size(1);
const int nope_dim = ql_nope.size(2);
const int rope_dim = q_pe.size(2);
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
nope_dim);
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
if (num_tokens == 0) return;
constexpr int warps_per_block = 8;
const int total_warps = num_tokens * num_heads;
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
const int block_size = warps_per_block * 32;
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
q_pe.stride(1));
});
}
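The rewritten `cp_gather_and_upconvert_fp8_kv_cache` kernel earlier in this file assigns one warp per output token and binary-searches the `workspace_starts` prefix array to find which request owns a given flat token index, then splits the per-request offset via the block table. A small Python sketch of that index mapping (illustrative only; names mirror the kernel arguments):

```python
import bisect

def token_to_request(flat_token_id: int, workspace_starts: list[int]) -> tuple[int, int]:
    """Map a flat output-token index to (request_id, token_offset_within_request).

    workspace_starts[i] is the first output slot of request i, so the owning
    request is the last i with workspace_starts[i] <= flat_token_id.
    """
    req_id = bisect.bisect_right(workspace_starts, flat_token_id) - 1
    return req_id, flat_token_id - workspace_starts[req_id]

def physical_location(token_offset: int, block_size: int) -> tuple[int, int]:
    """Split a per-request token offset into (cache block index, offset within block)."""
    return token_offset // block_size, token_offset % block_size

# With workspace_starts = [0, 5, 12], flat token 6 belongs to request 1 at offset 1:
assert token_to_request(6, [0, 5, 12]) == (1, 1)
```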

60
csrc/concat_mla_q.cuh Normal file
View File

@@ -0,0 +1,60 @@
#ifndef CONCAT_MLA_Q_CUH_
#define CONCAT_MLA_Q_CUH_
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "cuda_vec_utils.cuh"
namespace vllm {
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
// q_pe [num_tokens, num_heads, 64]
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
// Currently instantiated only for NOPE_DIM=512.
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
template <typename DType, int NOPE_DIM>
__global__ void ConcatMLAQKernel(
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
const int64_t out_stride_0, const int64_t out_stride_1,
const int64_t nope_stride_0, const int64_t nope_stride_1,
const int64_t pe_stride_0, const int64_t pe_stride_1) {
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
if (flat_warp_id >= num_tokens * num_heads) return;
const int token_id = flat_warp_id / num_heads;
const int head_id = flat_warp_id % num_heads;
const int lane_id = threadIdx.x & 31;
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
constexpr int nope_vec_loads =
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
const DType* nope_src =
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
#pragma unroll
for (int i = 0; i < nope_vec_loads; i++) {
const int offset = i * 32 + lane_id;
if constexpr (use_256b) {
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
} else {
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
}
}
const int* rope_src = reinterpret_cast<const int*>(
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
head_id * out_stride_1 + NOPE_DIM);
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
}
} // namespace vllm
#endif // CONCAT_MLA_Q_CUH_
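The per-warp load count in `ConcatMLAQKernel` follows directly from the dimensions: each lane moves one vector per iteration, so `nope_vec_loads = NOPE_DIM * sizeof(DType) / (vec_bytes * 32)`. A quick check of that arithmetic for bf16 with `NOPE_DIM=512`, assuming `ARCH_MAX_VEC_SIZE` is the per-lane vector width in bytes:

```python
NOPE_DIM = 512      # ql_nope feature dim handled by the kernel
DTYPE_BYTES = 2     # bf16 / fp16
LANES = 32          # threads per warp

def nope_vec_loads(vec_bytes: int) -> int:
    """Iterations each warp needs to copy NOPE_DIM elements with vec_bytes-wide loads."""
    return NOPE_DIM * DTYPE_BYTES // (vec_bytes * LANES)

print(nope_vec_loads(32))  # 256-bit loads -> 1 iteration per warp
print(nope_vec_loads(16))  # 128-bit loads -> 2 iterations per warp
```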

View File

@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int64_t block_size, const int64_t block_size_stride) {
// For AMX 2D tiles, size of each line is 64 bytes
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
// For AMX B martix, N always is 16
// For AMX B matrix, N always is 16
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
// For now suppose block_size is divisible by amx_tile_column_num

View File

@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
};
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
#ifdef __aarch64__
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoid runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
#else
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
#endif
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
@@ -411,21 +408,19 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
// dummy M size for prepacking weights
// Prepacking weights improves performance and avoid runtime reorders
constexpr dnnl_dim_t kProbeM = 128;
prepack_weight(args.b_ptr, original_b_md,
create_primitive_desc(
MSizeCacheKey{
#ifdef VLLM_USE_ACL
// Arm Compute Library (ACL) backend for oneDNN does
// not support runtime
// dimensions, so we set M to a default value
.a_m_size = 128,
.a_m_stride = b_k_size_,
#else
.a_m_size = DNNL_RUNTIME_DIM_VAL,
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
#endif
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
// selector can choose an optimally blocked
// weight layout.
.a_m_size = kProbeM,
.a_m_stride = b_k_size_,
.use_bias = false,
.bias_type = dnnl::memory::data_type::undef},
true)
.weights_desc());
init_runtime_memory_cache(args);

View File

@@ -4,7 +4,7 @@
#include <torch/library.h>
// Note: overwrite the external defination for sharing same name between
// Note: overwrite the external definition for sharing same name between
// libraries use different ISAs.
#define TORCH_EXTENSION_NAME _C

View File

@@ -196,7 +196,6 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
return val;
#else
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
return {};
#endif
}
@@ -211,23 +210,51 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
#endif
}
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
// 32-bit load / store.
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
// 32-bit cache-streaming (.cs) load / store.
// Falls back to ld32/st32 on ROCm (no .cs hint).
__forceinline__ __device__ int ld32_cs(const int* addr) {
#if VLLM_256B_PTX_ENABLED
int val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
return val;
#else
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
return 0;
val = ld32(addr);
#endif
return val;
}
__forceinline__ __device__ void st32_cs(int* addr, int val) {
#if VLLM_256B_PTX_ENABLED
#ifndef USE_ROCM
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
#else
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
st32(addr, val);
#endif
}
// 128-bit cache-streaming (.cs) load / store.
// Falls back to ld128/st128 on ROCm (no .cs hint).
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
int4 val;
#ifndef USE_ROCM
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(addr));
#else
ld128(val, addr);
#endif
return val;
}
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
#ifndef USE_ROCM
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
#else
st128(val, addr);
#endif
}
@@ -260,7 +287,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
bool pred) {
#if VLLM_256B_PTX_ENABLED
#ifndef USE_ROCM
uint32_t r0, r1, r2, r3;
asm volatile(
@@ -278,7 +305,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
val = uint4{r0, r1, r2, r3};
#else
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
assert(false && "ld128_cg_or_zero is not supported on ROCm");
#endif
}

View File

@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Intialize sorted_ids
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Intialize expert_ids with -1
// Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}

View File

@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
if (!lane_id) {
// Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8MO format. Remove Mantissa.
// Packed UE8M0 format. Remove Mantissa.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0;
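The `>> 7` in the corrected comment's context drops the 7 mantissa bits of the scale, leaving its 8-bit exponent, which is what the packed UE8M0 format stores (assuming the scale is held as a non-negative bf16, so the sign bit is zero). A small illustration of that bit manipulation, not taken from the kernel itself:

```python
import struct

def bf16_to_ue8m0(x: float) -> int:
    """UE8M0 (exponent-only) encoding of a non-negative float: truncate to bf16,
    then shift out the 7 mantissa bits, leaving the 8-bit biased exponent."""
    assert x >= 0.0
    bits32 = struct.unpack("<I", struct.pack("<f", x))[0]
    bf16 = bits32 >> 16   # truncate fp32 -> bf16 (sign + 8 exp + 7 mantissa bits)
    return bf16 >> 7      # drop mantissa: sign (0) + 8-bit exponent

print(bf16_to_ue8m0(1.0))  # 127 (unbiased exponent 0)
print(bf16_to_ue8m0(2.0))  # 128
print(bf16_to_ue8m0(0.5))  # 126
```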

View File

@@ -12,6 +12,7 @@
#include "../cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/w8a8/fp8/common.cuh"
#include "core/batch_invariant.hpp"
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
// issues, do not take strides, and are unable to handle PyTorch tensors that
@@ -1224,17 +1225,14 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
#if defined(__gfx950__)
#define WVSPLITKRC_1KPASS
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N, int GrpsShrB, int CHUNKK>
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
__attribute__((amdgpu_waves_per_eu(1, 1)))
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
const int By, const scalar_t* __restrict__ B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
const int CuCount) {
// Use upper half of glbl buffer for atomic reduce counting
int* cntr = (int*)(&glbl[M * N]);
wvSplitKrc_(const int actlN, const int K, const int Kap, const int M,
const int Bx, const int By, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ B,
const scalar_t* __restrict__ BIAS, float* glbl, int* cntr,
scalar_t* C, const int CuCount) {
constexpr int NTILE = 16;
constexpr int APAD = 1;
constexpr int ASTRD = 64;
@@ -1425,11 +1423,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
__builtin_amdgcn_global_load_lds(
(int*)(&A[min__(
K * actlN - A_CHUNK,
kOffcp + K * (n / CHUNKK +
(N / CHUNKK) * (threadIdx.x / (64 / CHUNKK)) +
(threadIdx.y % sprdN)))]),
(int*)(&A[min__(Kap * actlN - A_CHUNK,
kOffcp + Kap * (n / CHUNKK +
(N / CHUNKK) * (threadIdx.x /
(64 / CHUNKK)) +
(threadIdx.y % sprdN)))]),
(int*)(&s[(k +
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
16, 0, 0);
@@ -1476,7 +1474,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#endif
// B[] staging is cooperative across GrpsShrB, so sync here before reading
// back. This wait is currently inserted by compiler, but not gauranteed.
// back. This wait is currently inserted by compiler, but not guaranteed.
asm volatile("s_waitcnt 0");
__syncthreads();
@@ -1533,45 +1531,98 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
}
union flt4 {
scalar8 s8;
float2 f2[2];
float4 f4;
};
if (m + (threadIdx.x % 16) < M) {
int my_cntr;
int mindx = m + (threadIdx.x % 16);
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
// Atomic add the output, read biases
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
for (uint32_t j = 0; j < 4; j++) {
// int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
// (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
// int adr = mindx + M * nindx;
int g_nindx =
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx + M * g_nindx * 4;
atomicAdd(&glbl[g_adr], sum4[nt][0][j]);
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
int g_nindx =
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
if (DTRMNSTC) {
flt4 flt4_ = {.s8 = sum4[nt][0]};
__hip_atomic_store((float2*)&glbl[g_adr + M * N * (m0 / Mmod)],
flt4_.f2[0], __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store((float2*)&glbl[g_adr + 2 + M * N * (m0 / Mmod)],
flt4_.f2[1], __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
} else {
for (uint32_t j = 0; j < 4; j++)
atomicAdd((&glbl[g_adr + j]), sum4[nt][0][j]);
}
}
__atomic_signal_fence(__ATOMIC_SEQ_CST);
asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
__atomic_signal_fence(__ATOMIC_SEQ_CST);
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
int adr_ = mindx + M * nindx_ / 4;
// Update the complete counter
my_cntr = atomicAdd(&cntr[adr_], 1);
float vals[N / NTILE / GrpsShrB][4] = {};
// make sure LDS is free for write out staging
if (DTRMNSTC) __syncthreads();
// Update the complete counter
flt4 vals[N / NTILE / GrpsShrB] = {};
// If we're the last k-shard, read back the value and convert...
if (my_cntr + 1 == k_rnd) {
if (BIAS)
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
cntr[adr_] = 0; // clear for next round
if constexpr (DTRMNSTC) {
#pragma unroll
for (int ks = 0; ks < k_rnd; ks++) {
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
int g_nindx =
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
__builtin_amdgcn_global_load_lds(
(float4*)(&glbl[g_adr + M * N * ks]),
&(((float4*)s)[(threadIdx.y * THRDS) + ks * THRDS * 4 +
nt * THRDS * 4 * k_rnd]),
16, 0, 0);
}
}
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int g_nindx =
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx + M * g_nindx * 4;
vals[nt][j] = glbl[g_adr];
if (BIAS)
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
}
}
asm volatile("s_waitcnt 0");
for (int ks = 0; ks < k_rnd; ks++) {
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
float4 eval = ((float4*)s)[(threadIdx.x + threadIdx.y * THRDS) +
ks * THRDS * 4 + nt * THRDS * 4 * k_rnd];
vals[nt].f4 += eval;
}
}
} else {
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
int g_nindx =
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
vals[nt].f4 = *(float4*)(&glbl[g_adr]);
*(float4*)(&glbl[g_adr]) = {}; // clear out for next round
}
if (BIAS)
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
for (uint32_t j = 0; j < 4; j++) {
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
}
}
}
__builtin_amdgcn_sched_barrier(0);
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
@@ -1581,11 +1632,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (nindx < actlN) {
int adr = mindx + M * nindx;
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
vals[nt][j] += __bfloat162float(biases[nt][j]);
C[adr] = __float2bfloat16(vals[nt][j]);
vals[nt].s8[j] += __bfloat162float(biases[nt][j]);
C[adr] = __float2bfloat16(vals[nt].s8[j]);
} else {
vals[nt][j] += __half2float(biases[nt][j]);
C[adr] = __float2half(vals[nt][j]);
vals[nt].s8[j] += __half2float(biases[nt][j]);
C[adr] = __float2half(vals[nt].s8[j]);
}
}
}
@@ -1604,21 +1655,25 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
}
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N, int GrpsShrB, int CHUNKK>
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
const int Bx, const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, float* glbl,
// int* cntr,
scalar_t* C, const int CuCount){UNREACHABLE_CODE}
int* cntr, scalar_t* C,
const int CuCount){UNREACHABLE_CODE}
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
const std::optional<at::Tensor>& in_bias,
const int64_t CuCount) {
auto M_in = in_a.size(0);
auto N_in = in_b.size(0);
auto K_in = in_a.size(1);
int _DTRMNSTC = 1; // vllm::vllm_is_batch_invariant();
auto M_in = in_b.size(0);
auto N_in = in_a.size(0);
auto K_in = in_b.size(1);
auto Kap_in = in_a.stride(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
@@ -1635,13 +1690,9 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
auto out_c = torch::empty(
{N_in, M_in},
torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
auto axl_glbl = torch::empty(
{N_p2 + N_p2 / 4, M_in + M_in / 4},
torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
axl_glbl.zero_(); // disable for FAST_UNSAFE_RDC_INIT
dim3 grid(CuCount);
@@ -1649,55 +1700,70 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// const int max_lds_len = get_lds_size() / 2;
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
// and each working on a 512-shard of K, how many CUs would we need?
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
// How many of 4 waves in a group can work on same 16 Ms at same time? First
// try to maximize this. This reduces the Ms each group works on, i.e.
// increasing the number of CUs needed.
int GrpsShrB = min(N_p2 / 16, 4);
// Given the above, how many CUs would we need?
int CuNeeded = rndup_cus * GrpsShrB;
if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
// Can we increase SplitK by shrinking the K-shared to 256?
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
static torch::Tensor axl_glbl =
torch::zeros(
128 * 1024 * (_DTRMNSTC ? 12 : 1),
torch::TensorOptions().dtype(torch::kFloat32).device(in_a.device()))
.detach();
static torch::Tensor axl_cntr =
torch::zeros(
128 * 1024 * (_DTRMNSTC ? 12 : 1) / 4,
torch::TensorOptions().dtype(torch::kInt).device(in_a.device()))
.detach();
auto glbl = axl_glbl.data_ptr<float>();
auto cntr = axl_cntr.data_ptr<int>();
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
{ \
dim3 block(64, 4); \
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK> \
<<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, glbl, c, CuCount); \
if (_DTRMNSTC) \
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 1> \
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
af4, bf4, biasf4, glbl, cntr, c, \
CuCount); \
else \
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 0> \
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
af4, bf4, biasf4, glbl, cntr, c, \
CuCount); \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_a.scalar_type(), "wvSplitKrc", [&] {
using fptype = typename scalar<scalar_t>::type;
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
const fptype* af4 = reinterpret_cast<const fptype*>(in_a.data_ptr());
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
const fptype* biasf4 =
(in_bias.has_value() && in_bias->numel() > 0)
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
auto glbl = axl_glbl.data_ptr<float>();
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
// and each working on a 512-shard of K, how many CUs would we need?
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
// How many of 4 waves in a group can work on same 16 Ms at same time? First
// try to maximize this. This reduces the Ms each group works on, i.e.
// increasing the number of CUs needed.
int GrpsShrB = min(N_p2 / 16, 4);
// Given the above, how many CUs would we need?
int CuNeeded = rndup_cus * GrpsShrB;
if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
// Can we increase SplitK by shrinking the K-shared to 256?
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
switch (N_p2) {
case 16:
WVSPLITKrc(16, 1, 1) break;
case 32:
if (chunkk == 2)
WVSPLITKrc(32, 2, 2) else if (chunkk == 1) WVSPLITKrc(32, 2, 1) break;
if (chunkk == 2) WVSPLITKrc(32, 2, 2) else WVSPLITKrc(32, 2, 1) break;
case 64:
if (chunkk == 2)
WVSPLITKrc(64, 4, 2) else if (chunkk == 1) WVSPLITKrc(64, 4, 1) break;
if (chunkk == 2) WVSPLITKrc(64, 4, 2) else WVSPLITKrc(64, 4, 1) break;
case 128:
if (chunkk == 2)
WVSPLITKrc(128, 4, 2) else if (chunkk == 1)
WVSPLITKrc(128, 4, 1) break;
if (chunkk == 2) WVSPLITKrc(128, 4, 2) else WVSPLITKrc(128, 4, 1) break;
default:
throw std::runtime_error(
"Unsupported N value: " + std::to_string(M_in) + "," +

View File

@@ -802,6 +802,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
&indexer_k_quant_and_cache);
cache_ops.def(
"concat_mla_q(Tensor ql_nope, Tensor q_pe, Tensor! q_out) -> ()");
cache_ops.impl("concat_mla_q", torch::kCUDA, &concat_mla_q);
cache_ops.def(
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");

View File

@@ -18,14 +18,14 @@ th {
</style>
| Dataset | Online | Offline | Data Path |
|---------|--------|---------|-----------|
| ------- | ------ | ------- | --------- |
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomMultiModal (Image/Video) | | | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
@@ -383,14 +383,14 @@ The `--burstiness` parameter mathematically controls request arrival patterns us
Load Pattern Recommendations by Use Case:
| Use Case | Burstiness | Request Rate | Max Concurrency | Description |
| --- | --- | --- | --- | --- |
| Use Case | Burstiness | Request Rate | Max Concurrency | Description |
| --- | --- | --- | --- | --- |
| Maximum Throughput | N/A | Infinite | Limited | **Most common**: Simulates load balancer/gateway limits with unlimited user demand |
| Realistic Testing | 1.0 | Moderate (5-20) | Infinite | Natural Poisson traffic patterns for baseline performance |
| Stress Testing | 0.1-0.5 | High (20-100) | Infinite | Challenging burst patterns to test resilience |
| Latency Profiling | 2.0-5.0 | Low (1-10) | Infinite | Uniform load for consistent timing analysis |
| Capacity Planning | 1.0 | Variable | Limited | Test resource limits with realistic constraints |
| SLA Validation | 1.0 | Target rate | SLA limit | Production-like constraints for compliance testing |
| Realistic Testing | 1.0 | Moderate (5-20) | Infinite | Natural Poisson traffic patterns for baseline performance |
| Stress Testing | 0.1-0.5 | High (20-100) | Infinite | Challenging burst patterns to test resilience |
| Latency Profiling | 2.0-5.0 | Low (1-10) | Infinite | Uniform load for consistent timing analysis |
| Capacity Planning | 1.0 | Variable | Limited | Test resource limits with realistic constraints |
| SLA Validation | 1.0 | Target rate | SLA limit | Production-like constraints for compliance testing |
These load patterns help evaluate different aspects of your vLLM deployment, from basic performance characteristics to resilience under challenging traffic conditions.
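Concretely, burstiness values below 1.0 clump requests into bursts while values above 1.0 spread them more uniformly; the usual way to realize this is to draw inter-arrival gaps from a gamma distribution whose shape equals the burstiness and whose mean equals `1 / request_rate`. A short sketch of that sampling (an illustration of the concept, not the benchmark's exact implementation):

```python
import numpy as np

def inter_arrival_gaps(request_rate: float, burstiness: float, n: int,
                       seed: int = 0) -> np.ndarray:
    """Sample n inter-arrival gaps (seconds) with mean 1/request_rate.

    shape == burstiness: <1.0 gives bursty traffic, 1.0 gives Poisson arrivals,
    >1.0 gives more uniform spacing.
    """
    rng = np.random.default_rng(seed)
    scale = 1.0 / (request_rate * burstiness)  # keeps the mean gap at 1/request_rate
    return rng.gamma(shape=burstiness, scale=scale, size=n)

gaps = inter_arrival_gaps(request_rate=10.0, burstiness=0.5, n=1000)
print(gaps.mean())  # ~0.1 s regardless of burstiness; only the variance changes
```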
@@ -545,6 +545,24 @@ vllm bench throughput \
--lora-path yard1/llama-2-7b-sql-lora-test
```
#### Synthetic Random Multimodal (random-mm)
Generate synthetic multimodal inputs for offline throughput testing without external datasets.
Use `--backend vllm-chat` so that image tokens are counted correctly.
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name random-mm \
--num-prompts 100 \
--random-input-len 300 \
--random-output-len 40 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
```
</details>
### 🛠️ Structured Output Benchmark
@@ -846,8 +864,8 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented.
- For online benchmarks, use `--backend openai-chat` with endpoint `/v1/chat/completions`.
- For offline benchmarks, use `--backend vllm-chat` (see [Offline Throughput Benchmark](#-offline-throughput-benchmark) for an example).
Start the server (example):
@@ -913,6 +931,74 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
</details>
### 🔬 Multimodal Processor Benchmark
Benchmark per-stage latency of the multimodal (MM) input processor pipeline, including the encoder forward pass. This is useful for profiling preprocessing bottlenecks in vision-language models.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
The benchmark measures the following stages for each request:
| Stage | Description |
| ----- | ----------- |
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
| `preprocessor_total_secs` | Total preprocessing time |
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
| `num_encoder_calls` | Number of encoder invocations per request |
The benchmark also reports end-to-end latency (TTFT + decode time) per
request. Use `--metric-percentiles` to select which percentiles to report
(default: p99) and `--output-json` to save results.
#### Basic Example with Synthetic Data (random-mm)
```bash
vllm bench mm-processor \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset-name random-mm \
--num-prompts 50 \
--random-input-len 300 \
--random-output-len 40 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
```
#### Using a HuggingFace Dataset
```bash
vllm bench mm-processor \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 100
```
#### Warmup, Custom Percentiles, and JSON Output
```bash
vllm bench mm-processor \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset-name random-mm \
--num-prompts 200 \
--num-warmups 5 \
--random-input-len 300 \
--random-output-len 40 \
--random-mm-base-items-per-request 1 \
--metric-percentiles 50,90,95,99 \
--output-json results.json
```
See [`vllm bench mm-processor`](../cli/bench/mm_processor.md) for the full argument reference.
</details>
### Embedding Benchmark
Benchmark the performance of embedding requests in vLLM.

View File

@@ -60,12 +60,12 @@ Here is an example using the script to compare result_a and result_b with max co
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|------|-----|-----------|----------|----------|
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
| | -------------------- | --- | -------------------------------- | -------------------------------- | ---------- |
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
| 1 | 16 | inf | 25.49 | 246.92 | 9.69 |
| 2 | 24 | inf | 27.74 | 293.34 | 10.57 |
| 3 | 32 | inf | 28.61 |306.69 | 10.72 |
***compare-json-results.py Command-Line Parameters***

View File

@@ -1,5 +1,51 @@
# vllm bench mm-processor
## Overview
`vllm bench mm-processor` profiles the multimodal input processor pipeline of
vision-language models. It measures per-stage latency from the HuggingFace
processor through to the encoder forward pass, helping you identify
preprocessing bottlenecks and understand how different image resolutions or
item counts affect end-to-end request time.
The benchmark supports two data sources: synthetic random multimodal inputs
(`random-mm`) and HuggingFace datasets (`hf`). Warmup requests are run before
measurement to ensure stable results.
## Quick Start
```bash
vllm bench mm-processor \
--model Qwen/Qwen2-VL-7B-Instruct \
--dataset-name random-mm \
--num-prompts 50 \
--random-input-len 300 \
--random-output-len 40 \
--random-mm-base-items-per-request 2 \
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
```
## Measured Stages
| Stage | Description |
| ----- | ----------- |
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
| `preprocessor_total_secs` | Total preprocessing time |
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
| `num_encoder_calls` | Number of encoder invocations per request |
The benchmark also reports end-to-end latency (TTFT + decode time) per
request. Use `--metric-percentiles` to select which percentiles to report
(default: p99) and `--output-json` to save results.
For more examples (HF datasets, warmup, JSON output), see
[Benchmarking CLI — Multimodal Processor Benchmark](../../benchmarking/cli.md#multimodal-processor-benchmark).
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"

View File

@@ -1,3 +1,4 @@
<!-- markdownlint-disable MD041 -->
When passing JSON CLI arguments, the following sets of arguments are equivalent:
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
@@ -6,4 +7,4 @@ When passing JSON CLI arguments, the following sets of arguments are equivalent:
Additionally, list elements can be passed individually using `+`:
- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`

View File

@@ -5,6 +5,17 @@ This guide covers optimization strategies and performance tuning for vLLM V1.
!!! tip
Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
## Optimization Levels
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
- `-O0`: No optimizations. Fastest startup time, but lowest performance.
- `-O1`: Fast optimization. Simple compilation, fast fusions, and PIECEWISE cudagraphs.
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
For more information, see the [optimization level documentation](../design/optimization_levels.md).
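For a quick way to compare levels, the flag can simply be appended when launching the server; a minimal sketch driving the CLI from Python (the model name is a placeholder):

```python
import subprocess

# Launch the server with the default optimization level made explicit (-O2).
# Swap in -O0/-O1/-O3 to trade startup time against runtime performance.
subprocess.run([
    "vllm", "serve", "meta-llama/Llama-3.1-8B-Instruct",
    "-O2",
])
```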
## Preemption
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
@@ -282,7 +293,7 @@ llm = LLM(
Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:
| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
|-------------------|-------------|------------|------------|-------------|-------------|
| ----------------- | ----------- | ---------- | ---------- | ----------- | ----------- |
| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |

View File

@@ -94,7 +94,6 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
```bash
pre-commit run --hook-stage manual markdownlint
pre-commit run --hook-stage manual mypy-3.10
```

View File

@@ -66,12 +66,12 @@ This complicates the process as we cannot use the out-of-the-box
- Important indexes at the moment include:
| Platform | `--extra-index-url` |
|----------|-----------------|
| CUDA 12.8| [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128)|
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu)|
| -------- | ------------------- |
| CUDA 12.8 | [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128) |
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu) |
| ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
| ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
- Update the below files to match the CUDA version from step 1. This makes sure that the release vLLM wheel is tested on CI.
- `.buildkite/release-pipeline.yaml`

View File

@@ -66,7 +66,7 @@ stages will be removed.
Assume a feature is deprecated in `v0.9.0`.
| Release | Status |
|---------------|-------------------------------------------------------------------------------------------------|
| ------------- | ----------------------------------------------------------------------------------------------- |
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
| `v0.11.0` | Feature is removed. |

View File

@@ -5,8 +5,12 @@
## Profile with PyTorch Profiler
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.
!!! note
The `--profiler-config` flag is available in vLLM v0.13.0 and later. If you are using an earlier version, please upgrade to use this feature.
To use the `torch.profiler` module, set the `profiler` entry to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
- `torch_profiler_with_memory` to record memory, off by default

View File

@@ -49,7 +49,7 @@ chart **including persistent volumes** and deletes the release.
The following table describes configurable parameters of the chart in `values.yaml`:
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| --- | ---- | ------- | ----------- |
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
| autoscaling.enabled | bool | false | Enable autoscaling |
| autoscaling.maxReplicas | int | 100 | Maximum replicas |

View File

@@ -0,0 +1,87 @@
# RunPod
vLLM can be deployed on [RunPod](https://www.runpod.io/), a cloud GPU platform that provides on-demand and serverless GPU instances for AI inference workloads.
## Prerequisites
- A RunPod account with GPU pod access
- A GPU pod running a CUDA-compatible template (e.g., `runpod/pytorch`)
## Starting the Server
SSH into your RunPod pod and launch the vLLM OpenAI-compatible server:
```bash
python -m vllm.entrypoints.openai.api_server \
--model <model-name> \
--host 0.0.0.0 \
--port 8000
```
!!! note
Use `--host 0.0.0.0` to bind to all interfaces so the server is reachable from outside the container.
## Exposing Port 8000
RunPod exposes HTTP services through its proxy. To make port 8000 accessible:
1. In the RunPod dashboard, navigate to your pod settings.
2. Add `8000` to the list of exposed HTTP ports.
3. After the pod restarts, RunPod provides a public URL in the format:
```text
https://<pod-id>-8000.proxy.runpod.net
```
## Troubleshooting 502 Bad Gateway
A `502 Bad Gateway` error from the RunPod proxy typically means the server is not yet listening. Common causes:
- **Model still loading** — Large models take time to download and load into GPU memory. Check the pod logs for progress.
- **Wrong host binding** — Ensure you passed `--host 0.0.0.0`. Binding to `127.0.0.1` (the default) makes the server unreachable from the proxy.
- **Port mismatch** — Verify the `--port` value matches the port exposed in the RunPod dashboard.
- **Out of GPU memory** — The model may be too large for the allocated GPU. Check logs for CUDA OOM errors and consider using a larger instance or adding `--tensor-parallel-size` for multi-GPU pods.
## Verifying the Deployment
Once the server is running, test it with a curl request:
!!! console "Command"
```bash
curl https://<pod-id>-8000.proxy.runpod.net/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "<model-name>",
"messages": [
{"role": "user", "content": "Hello, how are you?"}
],
"max_tokens": 50
}'
```
!!! console "Response"
```json
{
"id": "chat-abc123",
"object": "chat.completion",
"choices": [
{
"message": {
"role": "assistant",
"content": "I'm doing well, thank you for asking! How can I help you today?"
},
"index": 0,
"finish_reason": "stop"
}
]
}
```
You can also check the server health endpoint:
```bash
curl https://<pod-id>-8000.proxy.runpod.net/health
```
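Since the endpoint is OpenAI-compatible, you can also exercise it from Python with the official `openai` client; a minimal sketch (the pod URL and model name are placeholders):

```python
from openai import OpenAI

# Substitute your pod id in the proxy URL.
client = OpenAI(
    base_url="https://<pod-id>-8000.proxy.runpod.net/v1",
    api_key="EMPTY",  # vLLM does not require a key unless --api-key is set
)

response = client.chat.completions.create(
    model="<model-name>",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=50,
)
print(response.choices[0].message.content)
```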

View File

@@ -0,0 +1,5 @@
# AIBrix
[AIBrix](https://github.com/vllm-project/aibrix) is a cloud-native control plane that integrates with vLLM to simplify Kubernetes deployment, scaling, routing, and LoRA adapter management for large language model inference.
For installation and usage instructions, please refer to the [AIBrix documentation](https://aibrix.readthedocs.io/).

View File

@@ -0,0 +1,7 @@
# NVIDIA Dynamo
[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) is an open-source framework for distributed LLM inference that can run vLLM on Kubernetes with flexible serving architectures (e.g. aggregated/disaggregated, optional router/planner).
For Kubernetes deployment instructions and examples (including vLLM), see the [Deploying Dynamo on Kubernetes](https://github.com/ai-dynamo/dynamo/blob/main/docs/kubernetes/README.md) guide.
Background reading: InfoQ news coverage — [NVIDIA Dynamo simplifies Kubernetes deployment for LLM inference](https://www.infoq.com/news/2025/12/nvidia-dynamo-kubernetes/).

View File

@@ -5,6 +5,7 @@
Please see the Installation Guides for environment specific instructions:
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
- [AKS](https://www.kubeai.org/installation/aks/)
- [EKS](https://www.kubeai.org/installation/eks/)
- [GKE](https://www.kubeai.org/installation/gke/)

View File

@@ -6,7 +6,7 @@ A Ray cluster can be declared in YAML, and the operator then handles pod schedul
## Why KubeRay instead of manual scripts?
| Feature | Manual scripts | KubeRay |
|---------|-----------------------------------------------------------|---------|
| ------- | --------------------------------------------------------- | ------- |
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |

View File

@@ -11,6 +11,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [Helm](frameworks/helm.md)
- [NVIDIA Dynamo](integrations/dynamo.md)
- [InftyAI/llmaz](integrations/llmaz.md)
- [llm-d](integrations/llm-d.md)
- [KAITO](integrations/kaito.md)
@@ -20,7 +21,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [kubernetes-sigs/lws](frameworks/lws.md)
- [meta-llama/llama-stack](integrations/llamastack.md)
- [substratusai/kubeai](integrations/kubeai.md)
- [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
- [vllm-project/AIBrix](integrations/aibrix.md)
- [vllm-project/production-stack](integrations/production-stack.md)
## Deployment with CPUs

View File

@@ -119,10 +119,10 @@ The code can be found in [vllm/v1/engine/coordinator.py](../../vllm/v1/engine/co
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
| Process Type | Count | Notes |
|---|---|---|
| - | - | - |
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
| GPU Worker | `N` (= `DP x TP`) | One per GPU, executes model forward passes |
| GPU Worker | `N` (= `DP x PP x TP`) | One per GPU, executes model forward passes |
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
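As a worked example of the corrected `N = DP x PP x TP` row (values are illustrative): with `TP=2`, `PP=2`, `DP=4`, and the default `A = DP = 4` API servers, the deployment runs `4 + 4 + 16 + 1 = 25` processes. The same arithmetic as a short helper:

```python
def process_count(tp: int, pp: int, dp: int, api_servers: int | None = None) -> int:
    """Total vLLM processes: API servers + engine cores + GPU workers
    (+ 1 DP coordinator when DP > 1)."""
    a = dp if api_servers is None else api_servers  # default: one API server per DP rank
    n = dp * pp * tp                                # one worker per GPU
    return a + dp + n + (1 if dp > 1 else 0)

print(process_count(tp=2, pp=2, dp=4))  # 25
print(process_count(tp=8, pp=1, dp=1))  # 1 + 1 + 8 = 10
```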

View File

@@ -101,7 +101,7 @@ Priority is **1 = highest** (tried first).
**Blackwell (SM 10.x):**
| Priority | Backend |
|----------|---------|
| -------- | ------- |
| 1 | `FLASHINFER` |
| 2 | `FLASH_ATTN` |
| 3 | `TRITON_ATTN` |
@@ -110,7 +110,7 @@ Priority is **1 = highest** (tried first).
**Ampere/Hopper (SM 8.x-9.x):**
| Priority | Backend |
|----------|---------|
| -------- | ------- |
| 1 | `FLASH_ATTN` |
| 2 | `FLASHINFER` |
| 3 | `TRITON_ATTN` |
@@ -121,7 +121,7 @@ Priority is **1 = highest** (tried first).
**Blackwell (SM 10.x):**
| Priority | Backend |
|----------|---------|
| -------- | ------- |
| 1 | `FLASHINFER_MLA` |
| 2 | `CUTLASS_MLA` |
| 3 | `FLASH_ATTN_MLA` |
@@ -133,7 +133,7 @@ Priority is **1 = highest** (tried first).
**Ampere/Hopper (SM 8.x-9.x):**
| Priority | Backend |
|----------|---------|
| -------- | ------- |
| 1 | `FLASH_ATTN_MLA` |
| 2 | `FLASHMLA` |
| 3 | `FLASHINFER_MLA` |
@@ -145,7 +145,7 @@ Priority is **1 = highest** (tried first).
## Legend
| Column | Description |
|--------|-------------|
| ------ | ----------- |
| **Dtypes** | Supported model data types (fp16, bf16, fp32) |
| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) |
| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
@@ -162,20 +162,20 @@ Priority is **1 = highest** (tried first).
## Standard Attention (MHA, MQA, GQA) Backends
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
|---------|---------|--------|-----------|-------------|------------|------|-----------|-----|-----------------|--------------|
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | | | ❌ | All | N/A |
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | | | ❌ | All | N/A |
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder, Enc-Dec | N/A |
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | | | ❌ | All | N/A |
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | | | ❌ | All | N/A |
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
> **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`.
>
@@ -191,10 +191,10 @@ The prefill backend is selected at runtime based on hardware and
configuration.
| Backend | Description | Compute Cap. | Enable | Disable | Notes |
|---------|-------------|--------------|--------|---------|-------|
| ------- | ----------- | ------------ | ------ | ------- | ----- |
| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
| FlashAttention | FlashAttention varlen (FA2/FA3) | Any | Default fallback | Use other backends | FA3 on SM90, FA2 otherwise |
> **‡** TRT-LLM Ragged is the default on Blackwell (SM100).
@@ -203,14 +203,14 @@ configuration.
### Decode Backends
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------|
| ------- | ------ | --------- | ----------- | ---------- | ---- | ------ | --------- | --- | --------------- | ------------ |
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
| `ROCM_AITER_MLA` | fp16, bf16 | `auto` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto` | Any | 576 | ❌ | | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 1 | Any | ❌ | | ❌ | ❌ | Decoder | N/A |
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | Any | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |


@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal
### `CudagraphDispatcher`
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
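To make the dispatch flow easier to follow, here is a minimal illustrative sketch of the idea; the class and field names below are invented for illustration and do not match the real vLLM implementation.

```python
# Illustrative sketch only: toy names, not the actual vLLM classes.
from dataclasses import dataclass
from enum import Enum


class RuntimeMode(Enum):
    NONE = 0        # run without a matching CUDA Graph
    PIECEWISE = 1
    FULL = 2


@dataclass(frozen=True)
class BatchKey:
    num_tokens: int
    uniform_decode: bool


class ToyCudagraphDispatcher:
    def __init__(self) -> None:
        # One set of valid dispatching keys per runtime mode,
        # filled once after the attention backends are initialized.
        self.keys: dict[RuntimeMode, set[BatchKey]] = {
            RuntimeMode.FULL: set(),
            RuntimeMode.PIECEWISE: set(),
        }

    def initialize_keys(self, capture_sizes: list[int]) -> None:
        for n in capture_sizes:
            self.keys[RuntimeMode.PIECEWISE].add(BatchKey(n, False))
            self.keys[RuntimeMode.FULL].add(BatchKey(n, True))

    def dispatch(self, rough_key: BatchKey) -> tuple[RuntimeMode, BatchKey]:
        # Return the runtime mode and the final key; wrappers trust this
        # decision via the forward context instead of re-deriving it.
        for mode in (RuntimeMode.FULL, RuntimeMode.PIECEWISE):
            if rough_key in self.keys[mode]:
                return mode, rough_key
        return RuntimeMode.NONE, rough_key
```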
@@ -174,18 +174,18 @@ Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that
The following table lists backends that support full CUDA Graphs at the time of writing.
| Attention Backend | cudagraph_support | Comments |
|:---|:---|:---|
| :---------------- | :---------------- | :------- |
| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS`, but falls back to `FULL_AND_PIECEWISE` as a workaround for performance reasons |
| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
| AITER FlashAttention | `UNIFORM_BATCH`| |
| AITER FlashAttention | `UNIFORM_BATCH` | |
| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
| FlashMLA | `UNIFORM_BATCH` | |
| FlashInferMLA | `UNIFORM_BATCH` | |
| FlashInferMLASparse | `UNIFORM_BATCH` | |
| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
| Mamba attention| `UNIFORM_SINGLE_TOKEN_DECODE` | |
| Mamba attention | `UNIFORM_SINGLE_TOKEN_DECODE` | |
Unlisted backends are all declared as `NEVER`.


@@ -5,12 +5,12 @@ TL;DR:
- use tlparse to acquire torch.compile logs. Include these logs in bug reports and/or support asks.
- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
| Online Flag | Offline Flag | Result |
|----------|----------|-------------|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
| Online Flag | Offline Flag | Result |
| ----------- | ------------ | ------ |
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
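For offline use, the same switches from the table above can be passed to the `LLM` constructor. A minimal sketch follows; the model name is a placeholder and the exact import paths for the config enums may differ between vLLM versions.

```python
# Sketch of the offline flags above; import paths may vary between versions.
from vllm import LLM
from vllm.config import CompilationConfig, CUDAGraphMode

# Keep torch.compile but turn off CUDA Graphs only:
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE),
)

# Turn off both torch.compile and CUDA Graphs:
# llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)

# Turn off TorchInductor only:
# llm = LLM(
#     model="meta-llama/Llama-3.1-8B-Instruct",
#     compilation_config=CompilationConfig(backend="eager"),
# )
```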
## vLLM-torch.compile overview


@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
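A toy sketch of that handshake may help; the `TopKWeightAndReduce` class names mirror the ones mentioned above, but the surrounding code is illustrative only and not the actual vLLM implementation.

```python
# Illustrative only: a toy version of the finalize_weight_and_reduce_impl handshake.
class TopKWeightAndReduceNoOp:
    """Experts kernel already applied the topk weights and reduced; finalize() skips it."""


class TopKWeightAndReduceContiguous:
    """finalize() must apply the topk weights and reduce the contiguous output."""


class ToyExperts:
    def __init__(self, handles_reduction: bool) -> None:
        self.handles_reduction = handles_reduction

    def finalize_weight_and_reduce_impl(self):
        # The modular kernel forwards this object to the prepare/finalize
        # implementation's finalize(), which either skips or performs the
        # weight application and reduction.
        if self.handles_reduction:
            return TopKWeightAndReduceNoOp()
        return TopKWeightAndReduceContiguous()
```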

docs/design/fusions.md (new file, 339 lines)

@@ -0,0 +1,339 @@
# Fusion torch.compile passes
vLLM applies a set of kernel/operator fusions at compile time (via custom [`torch.compile`](torch_compile.md) Inductor passes)
to separate optimizations from model definitions and avoid breaking layer abstractions in model code.
These fusions are controlled by fields in [`PassConfig`][vllm.config.compilation.PassConfig] and are automatically enabled
at appropriate [optimization levels](optimization_levels.md).
## Quick Reference
The table below maps each fusion to its controlling flag/config knob, the
operations it fuses, what level enables it by default, and an indicative speedup.
The Fullgraph column indicates whether the fusion requires the entire model graph to be
visible (either via Inductor partition or `splitting_ops=[]`),
and the last column indicates whether the fusion activates for all `num_tokens`
or just on the low or high end.
!!! info
Speedup depends heavily on the exact model, batch size, and hardware.
If tuning performance by hand, always benchmark your exact use-case with and without the fusion to verify the impact.
| Fusion | `PassConfig` flag | Fused operations | Default at | E2E Speedup | Fullgraph | `num_tokens` |
| ------------------------------------------------------------------------------ | ---------------------------- | ---------------------------------------------- | ------------------------------ | ------------------ | --------- | ------------ |
| [AllReduce + RMSNorm](#allreduce--rmsnorm-fuse_allreduce_rms) | `fuse_allreduce_rms` | All-reduce → RMSNorm (+residual_add) (→ quant) | O2 (Hopper/Blackwell + TP > 1) | 5-20% | No | Low |
| [Attention + Quant](#attention--quantization-fuse_attn_quant) | `fuse_attn_quant` | Attention output → FP8/NVFP4 quant | Off by default | 3-7% | Yes | Always |
| [RoPE + KV-Cache Update](#rope--kv-cache-update-fuse_rope_kvcache) | `fuse_rope_kvcache` | Rotary embedding → KV cache write | O1 (ROCm/AITER only) | TBD | No | Low |
| [QK Norm + RoPE](#qk-norm--rope-enable_qk_norm_rope_fusion) | `enable_qk_norm_rope_fusion` | Q/K RMSNorm → rotary embedding | Off by default | 2-3% | No | Low |
| [Sequence Parallelism](#sequence-parallelism-enable_sp) | `enable_sp` | AllReduce → ReduceScatter + AllGather | Off by default | Prereq for AsyncTP | Yes | High |
| [AsyncTP GEMM + collective](#asynctp-gemm--collective-overlap-fuse_gemm_comms) | `fuse_gemm_comms` | GEMM → reduce-scatter / all-gather → GEMM | Off by default | 7-10% | Yes | High |
| [RMSNorm + Quant](#rmsnorm--quantization-fuse_norm_quant) | `fuse_norm_quant` | RMSNorm (+residual add) → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
| [SiLU+Mul + Quant](#silumul--quantization-fuse_act_quant) | `fuse_act_quant` | SiLU+Mul activation → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
| [RMSNorm + Padding](#rmsnorm--padding-fuse_act_padding) | `fuse_act_padding` | Residual add + RMSNorm → padding | O1 (ROCm/AITER only) | TBD | No | Always |
## Support Matrix
The table below lists the quantization schemes supported by each fusion on each platform.
**—** means the fusion is not available on that platform. The latest and in-progress work is available in the tracking issue:
[#36066](https://github.com/vllm-project/vllm/issues/36066)
| Fusion | SM100 (Blackwell) | SM90 (Hopper) | SM89 (Ada) | SM80 (Ampere) | ROCm |
| ---------------------------- | ---------------------------------------- | ---------------------------------------- | ---------------------------------------- | ------------- | ---------------------------------------- |
| `fuse_allreduce_rms` | FP16/BF16, FP8 static, NVFP4 | FP16/BF16, FP8 static | — | — | — |
| `fuse_attn_quant`\* | FP8 static\*, NVFP4\* | FP8 static\* | FP8 static\* | — | FP8 static\* |
| `fuse_rope_kvcache` | — | — | — | — | FP16/BF16 |
| `enable_qk_norm_rope_fusion` | FP16/BF16 | FP16/BF16 | FP16/BF16† | FP16/BF16† | — |
| `enable_sp` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
| `fuse_gemm_comms` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
| `fuse_norm_quant` | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | — | FP8 static, FP8 per-token, FP8 per-group |
| `fuse_act_quant` | FP8 static, NVFP4 | FP8 static | FP8 static | — | FP8 per-group |
| `fuse_act_padding` | — | — | — | — | FP16/BF16 |
\* `fuse_attn_quant` support depends on the attention backend in use; not all backends support
fused quantization output. See the [`fuse_attn_quant` section](#attention--quantization-fuse_attn_quant)
for per-backend details.
`enable_sp` and `fuse_gemm_comms` are only autoconfigured for SM90 today;
support on other architectures requires setting `PassConfig.sp_min_token_num` explicitly.
SM100 support also requires setting `VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel`.
## Enabling / Disabling Fusions
Fusions are exposed through `PassConfig`, which is nested inside `CompilationConfig`:
```python
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig
llm = LLM(
model="...",
optimization_level=2, # Default optimization level
compilation_config=CompilationConfig(
pass_config=PassConfig(
fuse_norm_quant=True,
fuse_act_quant=True,
fuse_allreduce_rms=False, # disable a specific fusion
)
),
)
```
Fusions can also be enabled using command-line flags with any `vllm ...` command:
```bash
# Enable O2 defaults, but turn off allreduce fusion
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
# The above is equivalent to the more verbose:
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 --compilation-config '{"pass_config": {"fuse_allreduce_rms": false}}'
# Same syntax in other commands, e.g. vllm bench:
vllm bench latency --model=meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
```
Fields set explicitly by the user always take precedence over optimization-level defaults.
## Fusion Details
### AllReduce + RMSNorm (`fuse_allreduce_rms`)
!!! warning
TP+DP and TP+PP combinations are currently broken
([#34458](https://github.com/vllm-project/vllm/issues/34458) and
[#35426](https://github.com/vllm-project/vllm/issues/35426)).
Only supported on NVIDIA Hopper (SM90) and Blackwell (SM100) with FlashInfer installed.
**What it fuses.** Fuses the tensor-parallel all-reduce collective with the subsequent residual add,
RMSNorm, and optionally a quantization step into a single FlashInfer / TRT-LLM communication kernel.
This fusion is only profitable for small `num_tokens`,
so the fusion is only performed in the lower compiled range.
Patterns covered:
- `AllReduce → RMSNorm(+residual_add)`: CUDA sm90+ with FlashInfer
- `AllReduce → RMSNorm(+residual_add) → FP8 static quant`: CUDA sm90+ with FlashInfer
- `AllReduce → RMSNorm(+residual_add) → NVFP4 dynamic quant`: CUDA sm100+ with FlashInfer
The maximum tensor size below which the fused kernel is used is hardware-dependent (64 MB for TP=2
on SM90/SM100) and configurable via `PassConfig.fi_allreduce_fusion_max_size_mb`.
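As a rough sketch, the fusion and its size cutoff can be set through `PassConfig`; the values below are illustrative and assume an SM90/SM100 GPU with FlashInfer installed and TP > 1.

```python
# Sketch: enable AllReduce+RMSNorm fusion and tune the size cutoff (illustrative values).
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    tensor_parallel_size=2,
    compilation_config=CompilationConfig(
        pass_config=PassConfig(
            fuse_allreduce_rms=True,
            fi_allreduce_fusion_max_size_mb=64,  # max tensor size for the fused kernel
        )
    ),
)
```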
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/allreduce_rms_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/allreduce_rms_fusion.py)
- FlashInfer all-reduce: [`vllm/distributed/device_communicators/flashinfer_all_reduce.py`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/flashinfer_all_reduce.py)
- Benchmark: [`benchmarks/kernels/benchmark_fused_collective.py`](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_fused_collective.py)
### Attention + Quantization (`fuse_attn_quant`)
!!! info
`fuse_attn_quant` is currently not enabled at any optimization level by default and must be set
explicitly. It requires the full model graph to be visible (Inductor partition or `splitting_ops=[]`).
**What it fuses.** Fuses the attention output quantization directly after the attention computation,
eliminating a full-precision memory round-trip of the attention output. Patterns covered:
`Attention → FP8 static quant`:
- `TRITON_ATTN`: CUDA, ROCm
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
- `ROCM_ATTN`: ROCm
- `ROCM_AITER_UNIFIED_ATTN`: ROCm with AITER
`Attention → NVFP4 dynamic quant`:
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
Other attention backends do not support fused output quantization yet.
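A minimal sketch of enabling the fusion, assuming an FP8-quantized checkpoint served by one of the supported backends listed above:

```python
# Sketch: fuse_attn_quant needs the attention op inside the compiled graph.
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig

llm = LLM(
    model="...",  # an FP8-quantized model served by e.g. TRITON_ATTN or FLASHINFER
    compilation_config=CompilationConfig(
        splitting_ops=[],  # keep the full graph visible (or use Inductor partition)
        pass_config=PassConfig(fuse_attn_quant=True),
    ),
)
```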
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/attn_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/attn_quant_fusion.py)
- Attention backends: [`vllm/v1/attention/backends/`](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/)
### RoPE + KV-Cache Update (`fuse_rope_kvcache`)
!!! info
ROCm/AITER-only. Not available on NVIDIA CUDA or CPU. The fusion is only enabled for
`num_tokens ≤ 256` by default due to AITER fused kernel performance issues.
This threshold is configurable via `PassConfig.rope_kvcache_fusion_max_token_num`.
**What it fuses.** Fuses the rotary positional embedding kernel with the KV-cache scatter/write into
a single kernel, avoiding separate reads and writes of the key and value tensors.
Requires: AMD ROCm with AITER enabled, the `rotary_embedding` custom op active (automatic),
and the `kv_cache` update op visible in the graph, either via Inductor graph partition
or by removing it from `splitting_ops`.
If these conditions are set, the fusion is enabled automatically for optimization level O1 and above.
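A sketch of tuning the threshold on a ROCm/AITER setup; the values and placeholder model are illustrative only.

```python
# Sketch: ROCm/AITER only; adjust the token cutoff for the fused RoPE + KV-cache kernel.
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig

llm = LLM(
    model="...",
    compilation_config=CompilationConfig(
        pass_config=PassConfig(
            fuse_rope_kvcache=True,
            rope_kvcache_fusion_max_token_num=256,  # default mentioned above
        )
    ),
)
```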
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/rope_kvcache_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rope_kvcache_fusion.py)
### Sequence Parallelism (`enable_sp`)
**What it fuses.** Replaces all-reduce collectives with reduce-scatter + local RMSNorm + all-gather,
splitting the sequence dimension across TP ranks. This restructures the graph so the subsequent AsyncTP
pass can fuse the reduce-scatter / all-gather with the surrounding GEMMs.
Sequence Parallelism itself does not directly improve performance; it is a prerequisite for the
AsyncTP pass (`fuse_gemm_comms`). SP is only applied above a minimum token threshold that is
autoconfigured based on device capability and model `hidden_size`. Currently only active on
H100/SM90 for models with `hidden_size >= 8192`. The threshold is configurable via
`PassConfig.sp_min_token_num`.
The general transformation:
```text
Input → AllReduce → RMSNorm → Output
becomes:
Input → ReduceScatter → local RMSNorm → AllGather → Output
```
Patterns covered:
- First block: `AllReduce → RMSNorm` becomes `ReduceScatter → RMSNorm → AllGather`
- Middle blocks: `AllReduce → fused_add_RMSNorm` becomes `ReduceScatter → fused_add_RMSNorm → AllGather`
- Both with optional `→ FP8 static quant` suffix
Requires: `use_inductor_graph_partition=True` **or** piecewise compilation with static sizes
divisible by `tensor_parallel_size`.
Supported hardware: Only tested on NVIDIA CUDA, possibly works on ROCm. FP8 all-gather requires sm90+.
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
### AsyncTP GEMM + Collective Overlap (`fuse_gemm_comms`)
!!! info
Requires `enable_sp=True` (enabled automatically). This pass is a no-op if Sequence Parallelism has not been applied.
**What it fuses.** After Sequence Parallelism transforms the graph, fuses GEMM kernels with the
surrounding reduce-scatter (output projection) and all-gather (input projection) using
`torch.ops.symm_mem` symmetric-memory primitives, overlapping communication and computation.
This overlap is only profitable for large `num_tokens`, so the fusion (and preceding SP)
is only performed in the higher compiled range above `PassConfig.sp_min_token_num`.
Patterns covered:
- `GEMM → reduce-scatter` becomes `fused_matmul_reduce_scatter`
- `all-gather → GEMM` becomes `all_gather_matmul`
- FP8 scaled variants of both patterns
Supported hardware: NVIDIA CUDA with symmetric-memory (`torch.distributed._symmetric_memory`) support.
On B200, pattern-matching fp8 FlashInfer scaled MM is not supported, so it must be disabled
([#27893](https://github.com/vllm-project/vllm/issues/27893)):
```shell
VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel ...
```
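Putting the pieces together, a rough offline sketch follows; the model, threshold, and TP size are placeholders, and SP must be enabled for this pass to do anything.

```python
# Sketch: sequence parallelism + AsyncTP overlap; requires TP > 1 and symmetric memory.
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig

llm = LLM(
    model="...",
    tensor_parallel_size=2,
    compilation_config=CompilationConfig(
        use_inductor_graph_partition=True,  # or piecewise sizes divisible by the TP size
        pass_config=PassConfig(
            enable_sp=True,
            fuse_gemm_comms=True,
            sp_min_token_num=1024,  # illustrative; autoconfigured on SM90
        ),
    ),
)
```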
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/collective_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/collective_fusion.py)
- Sequence parallelism pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
### QK Norm + RoPE (`enable_qk_norm_rope_fusion`)
!!! info
Only applicable to models that apply per-head RMSNorm to Q and K before rotary positional
embedding (e.g. Qwen). Not enabled by default at any optimization level due to perf issues on H100:
[#34391](https://github.com/vllm-project/vllm/issues/34391)
**What it fuses.** Fuses the sequence: split QKV → reshape → Q/K RMSNorm → reshape → rotary
embedding into a single `fused_qk_norm_rope` CUDA kernel.
```text
# Unfused:
q, k, v = split(qkv)
q_norm = rms_norm(q.view(heads))
k_norm = rms_norm(k.view(kv_heads))
q_rope, k_rope = rotary_embedding(q_norm, k_norm, ...)
# Fused:
fused_qk_norm_rope(qkv, ...)
```
Supported hardware: CUDA (sm80+) only, tested only on sm90 and sm100.
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/qk_norm_rope_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/qk_norm_rope_fusion.py)
- CUDA kernel: [`csrc/ops.h`](https://github.com/vllm-project/vllm/blob/main/csrc/ops.h) (`fused_qk_norm_rope`)
### RMSNorm + Quantization (`fuse_norm_quant`)
!!! warning
On NVIDIA, Inductor actually generates a faster fused kernel than our custom CUDA kernel.
Hence, this fusion is only enabled when either `rms_norm` or `quant_fp8` is using a custom kernel.
**What it fuses.** Combines the custom `rms_norm` / `fused_add_rms_norm`
operations with subsequent quantization into a single fused kernel,
eliminating an intermediate read/write of the full-precision activation tensor.
Two variants are fused:
- *Plain RMSNorm + quant*: `rms_norm(x) → quant_fp8(y)`
- *Fused-add RMSNorm + quant*: `fused_add_rms_norm(x, residual) → quant_fp8(y)` — also updates the residual in-place.
Note that AITER fusions are currently in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
Supported quantization scheme/hardware combinations:
- FP8 static per-tensor: CUDA & HIP kernel
- FP8 dynamic per-token: CUDA & HIP kernel, AITER
- FP8 dynamic per-token-group (128/64): CUDA & HIP kernel, AITER
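A sketch of forcing the custom `rms_norm` kernel so this fusion becomes eligible; the checkpoint is a placeholder.

```python
# Sketch: keep rms_norm as a custom kernel so the fused norm+quant pass can match it.
from vllm import LLM
from vllm.config import CompilationConfig, PassConfig

llm = LLM(
    model="...",  # an FP8-quantized checkpoint
    compilation_config=CompilationConfig(
        custom_ops=["+rms_norm"],  # opt the op out of Inductor so the custom kernel is used
        pass_config=PassConfig(fuse_norm_quant=True),
    ),
)
```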
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/rms_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rms_quant_fusion.py)
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
- CUDA/HIP kernels: [`csrc/layernorm_quant_kernels.cu`](https://github.com/vllm-project/vllm/blob/main/csrc/layernorm_quant_kernels.cu)
### SiLU+Mul + Quantization (`fuse_act_quant`)
!!! warning
Same as `fuse_norm_quant`: on NVIDIA, Inductor generates a faster fused kernel than our custom ops.
This fusion is only enabled when either `silu_and_mul` or `quant_fp8` is using a custom kernel,
or for NVFP4-quantized models (where FP4 quant is always a custom op).
**What it fuses.** Fuses the `silu_and_mul` gate-up projection activation with subsequent quantization into a single kernel,
avoiding materialization of the full-precision post-activation tensor.
Note that AITER fusions are in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
Supported quantization scheme/hardware combinations:
- FP8 static per-tensor: CUDA & HIP kernel
- NVFP4 dynamic: CUDA sm100+ only with FlashInfer
- FP8 per-token-group (128): ROCm AITER only
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/act_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/act_quant_fusion.py)
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
- CUDA/HIP kernels: [`csrc/quantization/`](https://github.com/vllm-project/vllm/blob/main/csrc/quantization/)
### RMSNorm + Padding (`fuse_act_padding`)
!!! info
ROCm/AITER-only. Targeted at GPT-OSS models.
**What it fuses.** Fuses a residual add + RMSNorm with a subsequent padding operation that pads
the hidden dimension to a multiple required by downstream AITER Triton GEMM kernels.
Requires: AMD ROCm with AITER RMSNorm enabled. Enabled by default in optimization level O1 and above
when the hidden size is 2880 and AITER Triton GEMMs are *not* enabled.
**Code locations.**
- Pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py) (`RocmAiterTritonAddRMSNormPadFusionPass`)
## See Also
- [Optimization Levels](optimization_levels.md) — high-level presets that set
fusion defaults.
- [torch.compile in vLLM](torch_compile.md) — how the Inductor pass pipeline
works.
- [Attention Backends](attention_backends.md) — attention-specific kernel
selection.


@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests,
(s, d, UNIDIRECTIONAL or SWAP)
```
* If the Move specifies `UNIDRECTIONAL`:
* If the Move specifies `UNIDIRECTIONAL`:
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot


@@ -507,10 +507,10 @@ longer relevant in v1:
- `vllm:num_requests_swapped`
- `vllm:cpu_cache_usage_perc`
In this mode, when a request is preempted (e.g. to make room in KV
cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
In this mode, when a request was preempted (e.g. to make room in KV
cache to complete other requests), kv cache blocks were swapped out to
CPU memory. The `--swap-space` flag has been removed as this feature
is no longer used in V1.
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
SequenceGroup encapsulated the idea of N Sequences which


@@ -50,7 +50,7 @@ V1 was not originally designed with async scheduling in mind, and support requir
## 3. Removing Async Barrier
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.cuda.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.accelerator.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
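For example, a host-to-device copy only stays asynchronous when the source tensor is pinned; a minimal sketch:

```python
import torch

# Unpinned .to("cuda") blocks the CPU (an implicit sync).
# Pinned memory + non_blocking=True queues the copy on the stream and returns immediately.
host_buf = torch.empty(4096, dtype=torch.float32, pin_memory=True)
device_buf = host_buf.to("cuda", non_blocking=True)
```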


@@ -31,7 +31,7 @@ th {
</style>
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
| ------- | ------------------ | ------------ | ------------- | ----- | --------------------- | --------- |
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
@@ -78,7 +78,7 @@ Most experts flavors include an equivalent modular interface which will be a sub
To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
@@ -105,7 +105,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|---------|-----------------------------------------|----------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| ------- | ---------------------------------------------- | ----------------------------------- |
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |


@@ -12,9 +12,8 @@ page for information on known issues and how to solve them.
The use of Python multiprocessing in vLLM is complicated by:
- The use of vLLM as a library and the inability to control the code using vLLM
- Varying levels of incompatibilities between multiprocessing methods and vLLM
dependencies
- the use of vLLM as a library, which means vLLM cannot control the code that calls it;
- incompatibilities between certain multiprocessing methods and vLLM dependencies.
This document describes how vLLM deals with these challenges.
@@ -22,11 +21,9 @@ This document describes how vLLM deals with these challenges.
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
- `spawn` - spawn a new Python process. The default on Windows and macOS.
- `spawn` - Spawn a new Python process. The default on Windows and macOS.
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
Linux for Python versions prior to 3.14.
- `forkserver` - Spawn a server process that will fork a new process on request.
The default on Linux for Python version 3.14 and newer.
@@ -36,8 +33,8 @@ This document describes how vLLM deals with these challenges.
threads. If you are under macOS, using `fork` may cause the process to crash.
`spawn` is more compatible with dependencies, but can be problematic when vLLM
is used as a library. If the consuming code does not use a `__main__` guard (`if
__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
is used as a library. If the consuming code does not use a `__main__` guard
(`if __name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
spawns a new process. This can lead to infinite recursion, among other problems.
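A minimal guard looks like this (the model name is a placeholder):

```python
# Sketch: guard the entry point so this script is not re-executed when vLLM spawns workers.
from vllm import LLM


def main() -> None:
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
    outputs = llm.generate(["Hello, my name is"])
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    main()
```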
`forkserver` will spawn a new server process that will fork new processes on
@@ -57,8 +54,7 @@ Multiple vLLM dependencies indicate either a preference or requirement for using
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
It is perhaps more accurate to say that there are known problems with using
`fork` after initializing these dependencies.
Known issues exist when using `fork` after initializing these dependencies.
## Current State (v0)
@@ -66,8 +62,8 @@ The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control w
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
When we know we own the process because the `vllm` command was used, we use
`spawn` because it's the most widely compatible.
If the main process is controlled via the `vllm` command,
`spawn` is used because it's the most widely compatible.
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
@@ -104,8 +100,8 @@ dependencies and code using vLLM as a library.
### Changes Made in v1
There is not an easy solution with Python's `multiprocessing` that will work
everywhere. As a first step, we can get v1 into a state where it does "best
effort" choice of multiprocessing method to maximize compatibility.
everywhere. As a first step, we can get v1 into a state where it does
"best effort" choice of multiprocessing method to maximize compatibility.
- Default to `fork`.
- Use `spawn` when we know we control the main process (`vllm` was executed).
@@ -154,8 +150,8 @@ RuntimeError:
### Detect if a `__main__` guard is present
It has been suggested that we could behave better if we could detect whether
code using vLLM as a library has a `__main__` guard in place. This [post on
stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
code using vLLM as a library has a `__main__` guard in place. This
[post on Stack Overflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
was from a library author facing the same question.
It is possible to detect whether we are in the original, `__main__` process, or
@@ -192,4 +188,4 @@ that works around these challenges.
2. We can explore other libraries that may better suit our needs. Examples to
consider:
- <https://github.com/joblib/loky>


@@ -1,29 +1,18 @@
<!-- markdownlint-disable -->
# Optimization Levels
## Overview
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
- `-O0`: No optimization. Fastest startup time, but lowest performance.
- `-O1`: Fast optimization. Simple compilation and fast fusions, and PIECEWISE cudagraphs.
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
All optimization level defaults can be achieved by manually setting the underlying flags.
User-set flags take precedence over optimization level defaults.
## Level Summaries and Usage Examples
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=0
)
```
#### `-O1`: Quick Optimizations
- **Startup**: Moderate startup time
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
- **Use case**: Balance for most development scenarios
```bash
# CLI usage
@@ -34,31 +23,59 @@ from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=1
optimization_level=2 # equivalent to -O2
)
```
#### `-O2`: Full Optimizations (Default)
- **Startup**: Longer startup time
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
### `-O0`: No Optimization
```bash
# CLI usage (default, so optional)
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
Startup as fast as possible - no autotuning, no compilation, and no cudagraphs.
This level is good for initial phases of development and debugging.
# Python API usage
from vllm.entrypoints.llm import LLM
Settings:
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=2 # This is the default
)
```
- `-cc.cudagraph_mode=NONE`
- `-cc.mode=NONE` (also resulting in `-cc.custom_ops=["none"]`)
- `-cc.pass_config.fuse_...=False` (all fusions disabled)
- `--kernel-config.enable_flashinfer_autotune=False`
#### `-O3`: Full Optimization
Still in development. Added infrastructure to prevent changing API in future
release. Currently behaves the same as O2.
### `-O1`: Fast Optimization
Prioritize fast startup, but still enable basic optimizations like compilation and cudagraphs.
This level is a good balance for most development scenarios where you want faster startup but
still make sure your code does not break cudagraphs or compilation.
Settings:
- `-cc.cudagraph_mode=PIECEWISE`
- `-cc.mode=VLLM_COMPILE`
- `--kernel-config.enable_flashinfer_autotune=True`
Fusions:
- `-cc.pass_config.fuse_norm_quant=True`*
- `-cc.pass_config.fuse_act_quant=True`*
- `-cc.pass_config.fuse_act_padding=True`
- `-cc.pass_config.fuse_rope_kvcache=True`† (will be moved to O2)
\* These fusions are only enabled when either op is using a custom kernel, otherwise Inductor fusion is better.</br>
† These fusions are ROCm-only and require AITER.
### `-O2`: Full Optimization (Default)
Prioritize performance at the expense of additional startup time.
This level is recommended for production workloads and is hence the default.
Fusions in this level _may_ take longer due to additional compile ranges.
Settings (on top of `-O1`):
- `-cc.cudagraph_mode=FULL_AND_PIECEWISE`
- `-cc.pass_config.fuse_allreduce_rms=True`
### `-O3`: Aggressive Optimization
This level is currently the same as `-O2`, but may include additional optimizations
in the future that are more time-consuming or experimental.
## Troubleshooting
@@ -66,4 +83,4 @@ release. Currently behaves the same O2.
1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
3. **Performance Issues**: Ensure using `-O2` for production


@@ -141,7 +141,7 @@ Every plugin has three parts:
- triton ops
The custom approach does not currently work for Triton ops.
7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
## Compatibility Guarantee
@@ -155,4 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
- `prompt` in `Platform.validate_request` is deprecated and will be removed in v0.18.0.
- `prompt` in `Platform.validate_request` is deprecated. It has been removed in v0.18.0.


@@ -26,7 +26,7 @@ This feature is off by default, but can be enabled by setting `compile_mm_encode
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_encoder`. This will gate the compilation behind our
`compile_mm_encoder` configuration
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile


@@ -37,7 +37,7 @@ th:not(:first-child) {
</style>
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
@@ -59,23 +59,23 @@ th:not(:first-child) {
### Feature x Hardware
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------|
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
| ------- | ----- | ------ | ------ | --- | ------ | --- | --- | --------- |
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](speculative_decoding/README.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |
| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
!!! note
For information on feature support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation.

Some files were not shown because too many files have changed in this diff.