Compare commits
225 Commits
v0.17.0rc0
...
v0.17.1rc0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f83b933b84 | ||
|
|
82f3f30e26 | ||
|
|
9095cbbfb6 | ||
|
|
721ae79f50 | ||
|
|
aefc59f088 | ||
|
|
d88f28da05 | ||
|
|
106ff69c4e | ||
|
|
ca5fb4bbd8 | ||
|
|
cf88b23749 | ||
|
|
a3189a08b0 | ||
|
|
409c4e632d | ||
|
|
8850738b70 | ||
|
|
234860399b | ||
|
|
c88510083b | ||
|
|
4ff8c3c8f9 | ||
|
|
507ddbe992 | ||
|
|
ddbb0d230a | ||
|
|
9efc3bdcd6 | ||
|
|
156e33553c | ||
|
|
d0cd736caa | ||
|
|
195c997203 | ||
|
|
04b67d8f62 | ||
|
|
7279374f91 | ||
|
|
006aea17d7 | ||
|
|
0836be3b03 | ||
|
|
4e95ec111c | ||
|
|
179547d62c | ||
|
|
f85b4eda3a | ||
|
|
2a194ddd72 | ||
|
|
203a7f27da | ||
|
|
483463f735 | ||
|
|
4e571ce643 | ||
|
|
4ff9b045fe | ||
|
|
3fd03f1ec2 | ||
|
|
10a5f4d53d | ||
|
|
fe0c085c28 | ||
|
|
8d6b3d5dda | ||
|
|
4b87ffbefb | ||
|
|
fa028207aa | ||
|
|
d460a18fc6 | ||
|
|
6e956d9eca | ||
|
|
1e0f917b34 | ||
|
|
c174d54f86 | ||
|
|
55d27cca55 | ||
|
|
580864d81e | ||
|
|
2b28b9b269 | ||
|
|
70485a11bd | ||
|
|
74a9f54cdb | ||
|
|
00c4cb5606 | ||
|
|
941e52c298 | ||
|
|
be292b7c14 | ||
|
|
77a73458e3 | ||
|
|
5578f2a4d3 | ||
|
|
3ec2115015 | ||
|
|
b0906d8b02 | ||
|
|
aaf5fa9abf | ||
|
|
f96c3ab08c | ||
|
|
dc6b578466 | ||
|
|
1bc9c77f6d | ||
|
|
65a4da1504 | ||
|
|
217f27598d | ||
|
|
fff3711a24 | ||
|
|
c4d859c274 | ||
|
|
747431044d | ||
|
|
d62856b928 | ||
|
|
bd2659a566 | ||
|
|
90512b2e8b | ||
|
|
dcf8862fd4 | ||
|
|
43aa389231 | ||
|
|
384425f84e | ||
|
|
a0f44bb616 | ||
|
|
fde4771bbd | ||
|
|
e5ff140216 | ||
|
|
0a6a3a1290 | ||
|
|
4497431df6 | ||
|
|
b7332b058c | ||
|
|
40077ea3de | ||
|
|
5d6aae4577 | ||
|
|
63298ee173 | ||
|
|
2dde535df1 | ||
|
|
379689d533 | ||
|
|
a6be75dbd2 | ||
|
|
ee54f9cdb9 | ||
|
|
fc4657756f | ||
|
|
eebd14651f | ||
|
|
ebb9cc5f2b | ||
|
|
85f50eb41f | ||
|
|
5261223c2d | ||
|
|
00b814ba5a | ||
|
|
ee8a29511f | ||
|
|
755356b3d1 | ||
|
|
58928475e4 | ||
|
|
1a9718085c | ||
|
|
7eb524e64c | ||
|
|
c7f32e08c2 | ||
|
|
b354686524 | ||
|
|
6a18d8789b | ||
|
|
24a03915f5 | ||
|
|
b5e34e1fca | ||
|
|
ce8546a12b | ||
|
|
c188749bcd | ||
|
|
225d1090a0 | ||
|
|
f3c6c9c9d7 | ||
|
|
26bd43b52d | ||
|
|
6b625a8807 | ||
|
|
54756b6109 | ||
|
|
39f9ea0da4 | ||
|
|
e4ae148a78 | ||
|
|
1d0c0d209c | ||
|
|
fcb73f306c | ||
|
|
e2090bf3af | ||
|
|
2a00d3241f | ||
|
|
10f4db4dbe | ||
|
|
5b3ba94ab4 | ||
|
|
90f3c01fa4 | ||
|
|
807d680337 | ||
|
|
5afb387bd4 | ||
|
|
43e77e59ab | ||
|
|
00bd08edee | ||
|
|
43f10573c9 | ||
|
|
86e1060b17 | ||
|
|
27066d1b2b | ||
|
|
57c84ff129 | ||
|
|
e68de8adc0 | ||
|
|
a1ffa56a1e | ||
|
|
0a208d1f54 | ||
|
|
03a49bb8f0 | ||
|
|
8e87cc57f1 | ||
|
|
6dd302653f | ||
|
|
de00ebeac4 | ||
|
|
639680d220 | ||
|
|
c5362c739f | ||
|
|
0a49676fb0 | ||
|
|
c012a8c477 | ||
|
|
ebed80a7c8 | ||
|
|
a73af584fe | ||
|
|
a97954b6a8 | ||
|
|
a911f4dd20 | ||
|
|
5395471d29 | ||
|
|
a57c877f18 | ||
|
|
f917020983 | ||
|
|
86483ca774 | ||
|
|
b93a9e6f6d | ||
|
|
d8839ef7d9 | ||
|
|
e998fa76b9 | ||
|
|
6a895197fa | ||
|
|
8c760b6ab6 | ||
|
|
3ee68590c7 | ||
|
|
7196348157 | ||
|
|
176c799f4c | ||
|
|
612e7729c2 | ||
|
|
ecde7af9c4 | ||
|
|
8df523351f | ||
|
|
b03ff6a96b | ||
|
|
ed81d5edd1 | ||
|
|
3c23ac840e | ||
|
|
a708ef5944 | ||
|
|
66a2209645 | ||
|
|
0bfa229bf1 | ||
|
|
7493c51c55 | ||
|
|
ac773bbe80 | ||
|
|
48e376a007 | ||
|
|
21eb2c3372 | ||
|
|
e2b31243c0 | ||
|
|
c3598d02fa | ||
|
|
57c629e9c1 | ||
|
|
d106bf39f5 | ||
|
|
b0651021e5 | ||
|
|
f600d5192e | ||
|
|
8e7820131e | ||
|
|
0a12cea25f | ||
|
|
dd6dbd93f8 | ||
|
|
26366009c5 | ||
|
|
16c472abe7 | ||
|
|
3b23d57c96 | ||
|
|
2f4226fe52 | ||
|
|
792cbd64ca | ||
|
|
2ed4722e26 | ||
|
|
a3299c3d1d | ||
|
|
6c21a0c2d7 | ||
|
|
562339abc3 | ||
|
|
d7adcadb9b | ||
|
|
f678c3f61a | ||
|
|
be0a3f7570 | ||
|
|
17dc9c7fc9 | ||
|
|
7eca859110 | ||
|
|
636ee223ac | ||
|
|
b7d59ffce2 | ||
|
|
5569f5218d | ||
|
|
138d891d7f | ||
|
|
d7166e74c1 | ||
|
|
417fd28fb1 | ||
|
|
7faba503c4 | ||
|
|
bc6be89d16 | ||
|
|
32224f568a | ||
|
|
f3dc292e9f | ||
|
|
138c5fa186 | ||
|
|
2f2c1d73a7 | ||
|
|
fb3e78ab09 | ||
|
|
fd3bfe74c9 | ||
|
|
bfdb512f11 | ||
|
|
d25c1ec3c9 | ||
|
|
7cc6058ac6 | ||
|
|
28028dff2f | ||
|
|
3417ba5648 | ||
|
|
58cfe0dc44 | ||
|
|
e86221deb6 | ||
|
|
289fc48ab7 | ||
|
|
2f2212e6cc | ||
|
|
18e01a0a10 | ||
|
|
6cb901093f | ||
|
|
ead7bde1ab | ||
|
|
6aa6ad8992 | ||
|
|
c8c3935b70 | ||
|
|
bb6888b8b1 | ||
|
|
1aaec59d79 | ||
|
|
1659b2e058 | ||
|
|
d6e04f4c43 | ||
|
|
a8f66cbde8 | ||
|
|
16d2ad1d38 | ||
|
|
5dc3538736 | ||
|
|
36bf213181 | ||
|
|
6f0dd93801 | ||
|
|
5d199ac8f2 | ||
|
|
9e0f44bec4 |
@@ -13,9 +13,10 @@ import os
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import numpy as np
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
DEFAULT_RTOL = 0.08
|
DEFAULT_RTOL = 0.08
|
||||||
|
|
||||||
|
|
||||||
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
|
|||||||
"allow_deprecated_quantization=True,"
|
"allow_deprecated_quantization=True,"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
|
||||||
|
model_args += "attention_backend=TRITON_ATTN"
|
||||||
|
|
||||||
env_vars = eval_config.get("env_vars", None)
|
env_vars = eval_config.get("env_vars", None)
|
||||||
with scoped_env_vars(env_vars):
|
with scoped_env_vars(env_vars):
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
|||||||
f"ground_truth={ground_truth:.3f} | "
|
f"ground_truth={ground_truth:.3f} | "
|
||||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||||
)
|
)
|
||||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
|
||||||
|
min_acceptable = ground_truth * (1 - rtol)
|
||||||
|
success = success and measured_value >= min_acceptable
|
||||||
|
|
||||||
assert success
|
assert success
|
||||||
|
|||||||
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3-8B",
|
"model": "meta-llama/Meta-Llama-3-8B",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -10,7 +10,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -37,7 +36,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -64,7 +62,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
@@ -91,7 +88,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "deepseek-ai/DeepSeek-R1",
|
"model": "deepseek-ai/DeepSeek-R1",
|
||||||
"tensor_parallel_size": 8,
|
"tensor_parallel_size": 8,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy",
|
"load_format": "dummy",
|
||||||
"max-model-len": 2048,
|
"max-model-len": 2048,
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 1,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -23,7 +22,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -41,7 +39,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||||
"tensor_parallel_size": 2,
|
"tensor_parallel_size": 2,
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
@@ -59,7 +56,6 @@
|
|||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
"tensor_parallel_size": 4,
|
"tensor_parallel_size": 4,
|
||||||
"swap_space": 16,
|
|
||||||
"speculative_config": {
|
"speculative_config": {
|
||||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||||
"num_speculative_tokens": 4,
|
"num_speculative_tokens": 4,
|
||||||
|
|||||||
@@ -166,8 +166,15 @@ See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for contex
|
|||||||
EOF
|
EOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Notify Slack if webhook is configured.
|
# Notify Slack if webhook is configured and PR/branch are valid.
|
||||||
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
|
||||||
|
PR="${BUILDKITE_PULL_REQUEST:-}"
|
||||||
|
BRANCH="${BUILDKITE_BRANCH:-}"
|
||||||
|
|
||||||
|
# Skip notification if PR is invalid or branch is empty
|
||||||
|
if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
|
||||||
|
echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
|
||||||
|
else
|
||||||
echo ">>> Sending Slack notification"
|
echo ">>> Sending Slack notification"
|
||||||
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
# Single quotes are intentional: the f-string expressions are Python, not shell.
|
||||||
# shellcheck disable=SC2016
|
# shellcheck disable=SC2016
|
||||||
@@ -198,6 +205,7 @@ print(json.dumps(data))
|
|||||||
-H 'Content-type: application/json' \
|
-H 'Content-type: application/json' \
|
||||||
-d "$PAYLOAD")
|
-d "$PAYLOAD")
|
||||||
echo " Slack webhook response: $HTTP_CODE"
|
echo " Slack webhook response: $HTTP_CODE"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ function cpu_tests() {
|
|||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run model tests
|
# Run model tests
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ function cpu_tests() {
|
|||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
export TORCH_COMPILE_DISABLE=1
|
export TORCH_COMPILE_DISABLE=1
|
||||||
set -xve
|
set -xve
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
|
|||||||
@@ -25,5 +25,5 @@ remove_docker_container
|
|||||||
|
|
||||||
# Run the image and test offline inference
|
# Run the image and test offline inference
|
||||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
|
python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ docker run --rm --runtime=habana --name="${container_name}" --network=host \
|
|||||||
-e PT_HPU_LAZY_MODE=1 \
|
-e PT_HPU_LAZY_MODE=1 \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c '
|
/bin/bash -c '
|
||||||
cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
'
|
'
|
||||||
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
|
|||||||
@@ -34,17 +34,17 @@ docker run \
|
|||||||
set -e
|
set -e
|
||||||
echo $ZE_AFFINITY_MASK
|
echo $ZE_AFFINITY_MASK
|
||||||
pip install tblib==3.1.0
|
pip install tblib==3.1.0
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
|
||||||
python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
|
||||||
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
|
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:
|
|||||||
BACKENDS=("allgather_reducescatter")
|
BACKENDS=("allgather_reducescatter")
|
||||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||||
export VLLM_ROCM_MOE_PADDING=0
|
export VLLM_ROCM_MOE_PADDING=0
|
||||||
PLATFORM_ARGS=("--no-async-scheduling")
|
PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN")
|
||||||
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
echo "Disabled async scheduling for ROCm platform due to issues with spec decode."
|
||||||
else
|
else
|
||||||
# Non-ROCm platform (CUDA/other)
|
# Non-ROCm platform (CUDA/other)
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ obj_json="objects.json"
|
|||||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||||
|
|
||||||
# call script to generate indicies for all existing wheels
|
# call script to generate indices for all existing wheels
|
||||||
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
# this indices have relative paths that could work as long as it is next to the wheel directory in s3
|
||||||
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
|
||||||
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
|
||||||
|
|||||||
@@ -54,10 +54,13 @@ mkdir -p $DIST_DIR
|
|||||||
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
# include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64')
|
||||||
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
|
||||||
echo "Wheels copied to local directory"
|
echo "Wheels copied to local directory"
|
||||||
# generate source tarball
|
# generate source distribution using setup.py
|
||||||
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
|
python setup.py sdist --dist-dir=$DIST_DIR
|
||||||
ls -la $DIST_DIR
|
ls -la $DIST_DIR
|
||||||
|
|
||||||
|
SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz")
|
||||||
|
echo "Found sdist: $SDIST_FILE"
|
||||||
|
|
||||||
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
|
||||||
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
|
||||||
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
||||||
@@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python3 -m twine check "$PYPI_WHEEL_FILES"
|
python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
|
python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE"
|
||||||
echo "Wheels uploaded to PyPI"
|
echo "Wheels and source distribution uploaded to PyPI"
|
||||||
|
|||||||
@@ -467,7 +467,7 @@ steps:
|
|||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||||
|
|
||||||
- label: V1 Test attention (H100) # 10min
|
- label: V1 Test attention (H100) # 10min
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -499,17 +499,6 @@ steps:
|
|||||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
- label: V1 Test attention (B200) # 10min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
gpu: b200
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/config/attention.py
|
|
||||||
- vllm/model_executor/layers/attention
|
|
||||||
- vllm/v1/attention
|
|
||||||
- tests/v1/attention
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/attention
|
|
||||||
|
|
||||||
- label: V1 Test others (CPU) # 5 mins
|
- label: V1 Test others (CPU) # 5 mins
|
||||||
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
|
||||||
agent_pool: mi325_1
|
agent_pool: mi325_1
|
||||||
@@ -540,12 +529,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -1180,52 +1169,45 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 21 min
|
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 40
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
# optional: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/quantization/fp4/
|
- csrc/quantization/fp4/
|
||||||
- csrc/attention/mla/
|
|
||||||
- csrc/quantization/cutlass_w8a8/moe/
|
|
||||||
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
|
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
|
|
||||||
- vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/v1/attention/backends/mla/cutlass_mla.py
|
- vllm/v1/worker/
|
||||||
- vllm/v1/attention/backends/mla/flashinfer_mla.py
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
- vllm/v1/attention/selector.py
|
- vllm/compilation/
|
||||||
- vllm/platforms/cuda.py
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/passes/test_fusion_attn.py
|
||||||
|
- tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
|
- tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
|
- tests/compile/fullgraph/test_full_graph.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- pytest -v -s tests/compile/passes/test_fusion_attn.py
|
||||||
# Attention
|
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
# # Wrap with quotes to escape yaml
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
# Quantization
|
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
|
||||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@@ -1258,16 +1240,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
|
||||||
timeout_in_minutes: 120
|
|
||||||
gpu: b200
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
|
|
||||||
@@ -1514,6 +1486,20 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi325_4
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
|
|
||||||
@@ -1654,7 +1640,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt
|
||||||
|
|
||||||
##### EPLB Accuracy Tests #####
|
##### EPLB Accuracy Tests #####
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
@@ -1681,16 +1667,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: b200
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
|
||||||
|
|
||||||
|
|
||||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
@@ -2174,20 +2150,7 @@ steps:
|
|||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
# TODO: Add the "V1 Test attetion (MI300)" test group
|
# TODO: Add the "V1 Test attention (MI300)" test group
|
||||||
|
|
||||||
- label: V1 Test attention (H100) # 10min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
agent_pool: mi355_1
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
gpu: h100
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/config/attention.py
|
|
||||||
- vllm/model_executor/layers/attention
|
|
||||||
- vllm/v1/attention
|
|
||||||
- tests/v1/attention
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/attention
|
|
||||||
|
|
||||||
- label: Batch Invariance Tests (H100) # 10min
|
- label: Batch Invariance Tests (H100) # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@@ -2205,6 +2168,8 @@ steps:
|
|||||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||||
|
|
||||||
- label: V1 Test attention (B200) # 10min
|
- label: V1 Test attention (B200) # 10min
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_1
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
gpu: b200
|
gpu: b200
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -2243,12 +2208,12 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
# for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -2824,12 +2789,14 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
# - pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 21 min
|
- label: Blackwell Test (MI355) # 21 min
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_1
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
@@ -2848,28 +2815,28 @@ steps:
|
|||||||
- vllm/v1/attention/selector.py
|
- vllm/v1/attention/selector.py
|
||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- rocm-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
#- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
#- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
|
||||||
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
#- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
|
||||||
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
#- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
|
||||||
# Quantization
|
## Quantization
|
||||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
#- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
#- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
#- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
#- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
#- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
#- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
#- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
#- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
#- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
|
||||||
|
|
||||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
@@ -2939,13 +2906,15 @@ steps:
|
|||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
- label: Blackwell LM Eval Small Models
|
||||||
timeout_in_minutes: 120
|
timeout_in_minutes: 120
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||||
|
agent_pool: mi355_2
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true # run on nightlies
|
optional: true # run on nightlies
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
|
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@@ -3181,6 +3150,20 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
|
||||||
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
|
agent_pool: mi355_4
|
||||||
|
# grade: Blocking
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
|
||||||
|
- CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
|
|
||||||
@@ -3314,7 +3297,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt
|
||||||
|
|
||||||
##### EPLB Accuracy Tests #####
|
##### EPLB Accuracy Tests #####
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
@@ -3328,18 +3311,9 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355)
|
||||||
mirror_hardwares: [amdexperimental, amdproduction]
|
mirror_hardwares: [amdexperimental, amdproduction, amdmi355]
|
||||||
agent_pool: mi355_4
|
agent_pool: mi355_2
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true
|
optional: true
|
||||||
@@ -3358,3 +3332,18 @@ steps:
|
|||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|
||||||
|
- label: Attention Benchmarks Smoke Test (B200-MI355)
|
||||||
|
device: b200
|
||||||
|
mirror_hardwares: [amdexperimental, amdmi355]
|
||||||
|
agent_pool: mi355_2
|
||||||
|
num_gpus: 2
|
||||||
|
optional: true
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
timeout_in_minutes: 10
|
||||||
|
source_file_dependencies:
|
||||||
|
- benchmarks/attention_benchmarks/
|
||||||
|
- vllm/v1/attention/
|
||||||
|
commands:
|
||||||
|
- python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,16 @@ steps:
|
|||||||
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
|
- label: AsyncTP Correctness Tests (B200)
|
||||||
|
timeout_in_minutes: 50
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
device: b200
|
||||||
|
optional: true
|
||||||
|
num_devices: 2
|
||||||
|
commands:
|
||||||
|
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
|
||||||
|
- pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
|
||||||
|
|
||||||
- label: Distributed Compile Unit Tests (2xH100)
|
- label: Distributed Compile Unit Tests (2xH100)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ steps:
|
|||||||
- tests/v1/distributed
|
- tests/v1/distributed
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
|
- tests/distributed/test_multiproc_executor.py
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
@@ -95,6 +96,8 @@ steps:
|
|||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
||||||
|
# test multi-node TP with multiproc executor (simulated on single node)
|
||||||
|
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
# OLD rlhf examples
|
# OLD rlhf examples
|
||||||
@@ -210,6 +213,19 @@ steps:
|
|||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
device: a100
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_devices: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- vllm/v1/worker/kv_connector_model_runner_mixin.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh
|
||||||
|
|
||||||
- label: Pipeline + Context Parallelism (4 GPUs)
|
- label: Pipeline + Context Parallelism (4 GPUs)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
|||||||
@@ -41,6 +41,11 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (API Server 2)
|
- label: Entrypoints Integration (API Server 2)
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@@ -55,6 +60,11 @@ steps:
|
|||||||
- pytest -v -s entrypoints/instrumentator
|
- pytest -v -s entrypoints/instrumentator
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
|
||||||
- pytest -v -s tool_use
|
- pytest -v -s tool_use
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Entrypoints Integration (Pooling)
|
- label: Entrypoints Integration (Pooling)
|
||||||
timeout_in_minutes: 50
|
timeout_in_minutes: 50
|
||||||
@@ -87,6 +97,11 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s v1/entrypoints
|
- pytest -v -s v1/entrypoints
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: OpenAI API Correctness
|
- label: OpenAI API Correctness
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
|
|||||||
@@ -8,8 +8,9 @@ steps:
|
|||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
- tests/kernels/test_top_k_per_row.py
|
- tests/kernels/test_top_k_per_row.py
|
||||||
|
- tests/kernels/test_concat_mla_q.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
- label: Kernels Attention Test %N
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
@@ -96,7 +97,7 @@ steps:
|
|||||||
- vllm/platforms/cuda.py
|
- vllm/platforms/cuda.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
# Attention
|
# Attention
|
||||||
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
|
||||||
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
- pytest -v -s tests/kernels/attention/test_attention_selector.py
|
||||||
|
|||||||
@@ -67,12 +67,13 @@ steps:
|
|||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/chat.py # for basic
|
# for basic
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 basic/offline_inference/chat.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 basic/offline_inference/generate.py --model facebook/opt-125m
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 basic/offline_inference/classify.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 basic/offline_inference/embed.py
|
||||||
|
- python3 basic/offline_inference/score.py
|
||||||
# for multi-modal models
|
# for multi-modal models
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/audio_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
@@ -87,6 +88,11 @@ steps:
|
|||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Metrics, Tracing (2 GPUs)
|
- label: Metrics, Tracing (2 GPUs)
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ steps:
|
|||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/basic/offline_inference/chat.py
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|||||||
@@ -12,6 +12,11 @@ steps:
|
|||||||
- pip freeze | grep -E 'torch'
|
- pip freeze | grep -E 'torch'
|
||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Processor Test (CPU)
|
- label: Multi-Modal Processor Test (CPU)
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -54,6 +59,11 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||||
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_1
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|
||||||
- label: Multi-Modal Models (Extended) 2
|
- label: Multi-Modal Models (Extended) 2
|
||||||
optional: true
|
optional: true
|
||||||
|
|||||||
@@ -15,9 +15,12 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||||
- pip uninstall vllm_add_dummy_platform -y
|
- pip uninstall vllm_add_dummy_platform -y
|
||||||
# end platform plugin tests
|
# end platform plugin tests
|
||||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
# begin io_processor plugins test
|
||||||
|
# test generic io_processor plugins functions
|
||||||
|
- pytest -v -s ./plugins_tests/test_io_processor_plugins.py
|
||||||
|
# test Terratorch io_processor plugins
|
||||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# test bge_m3_sparse io_processor plugin
|
# test bge_m3_sparse io_processor plugin
|
||||||
- pip install -e ./plugins/bge_m3_sparse_plugin
|
- pip install -e ./plugins/bge_m3_sparse_plugin
|
||||||
@@ -36,3 +39,8 @@ steps:
|
|||||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||||
|
mirror:
|
||||||
|
amd:
|
||||||
|
device: mi325_2
|
||||||
|
depends_on:
|
||||||
|
- image-build-amd
|
||||||
|
|||||||
7
.github/mergify.yml
vendored
7
.github/mergify.yml
vendored
@@ -3,6 +3,7 @@ pull_request_rules:
|
|||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- label != stale
|
||||||
|
- -closed
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@@ -37,15 +38,13 @@ pull_request_rules:
|
|||||||
|
|
||||||
> [!TIP]
|
> [!TIP]
|
||||||
> <details>
|
> <details>
|
||||||
> <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
|
> <summary>Is <code>mypy</code> failing?</summary>
|
||||||
> <br/>
|
> <br/>
|
||||||
> <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
|
> <code>mypy</code> is run differently in CI. If the failure is related to this check, please use the following command to run it locally:
|
||||||
>
|
>
|
||||||
> ```bash
|
> ```bash
|
||||||
> # For mypy (substitute "3.10" with the failing version if needed)
|
> # For mypy (substitute "3.10" with the failing version if needed)
|
||||||
> pre-commit run --hook-stage manual mypy-3.10
|
> pre-commit run --hook-stage manual mypy-3.10
|
||||||
> # For markdownlint
|
|
||||||
> pre-commit run --hook-stage manual markdownlint
|
|
||||||
> ```
|
> ```
|
||||||
> </details>
|
> </details>
|
||||||
|
|
||||||
|
|||||||
3
.github/workflows/macos-smoke-test.yml
vendored
3
.github/workflows/macos-smoke-test.yml
vendored
@@ -6,6 +6,9 @@ on:
|
|||||||
- main
|
- main
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
macos-m1-smoke-test:
|
macos-m1-smoke-test:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ repos:
|
|||||||
args: [--output-format, github, --fix]
|
args: [--output-format, github, --fix]
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
- repo: https://github.com/crate-ci/typos
|
- repo: https://github.com/crate-ci/typos
|
||||||
rev: v1.38.1
|
rev: v1.43.5
|
||||||
hooks:
|
hooks:
|
||||||
- id: typos
|
- id: typos
|
||||||
args: [--force-exclude]
|
args: [--force-exclude]
|
||||||
@@ -24,12 +24,12 @@ repos:
|
|||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||||
types_or: [c++, cuda]
|
types_or: [c++, cuda]
|
||||||
args: [--style=file, --verbose]
|
args: [--style=file, --verbose]
|
||||||
- repo: https://github.com/igorshubovych/markdownlint-cli
|
- repo: https://github.com/DavidAnson/markdownlint-cli2
|
||||||
rev: v0.45.0
|
rev: v0.21.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: markdownlint
|
- id: markdownlint-cli2
|
||||||
exclude: '.*\.inc\.md'
|
language_version: lts
|
||||||
stages: [manual] # Only run in CI
|
args: [--fix]
|
||||||
- repo: https://github.com/rhysd/actionlint
|
- repo: https://github.com/rhysd/actionlint
|
||||||
rev: v1.7.7
|
rev: v1.7.7
|
||||||
hooks:
|
hooks:
|
||||||
@@ -55,7 +55,7 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types_or: [python, pyi]
|
types_or: [python, pyi]
|
||||||
require_serial: true
|
require_serial: true
|
||||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.10
|
name: Run mypy for Python 3.10
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
||||||
@@ -127,6 +127,13 @@ repos:
|
|||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
|
# prevent use torch.cuda APIs
|
||||||
|
- id: check-torch-cuda-call
|
||||||
|
name: "Prevent new 'torch.cuda' APIs call"
|
||||||
|
entry: python tools/pre_commit/check_torch_cuda.py
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: [regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/pre_commit/validate_config.py
|
entry: python tools/pre_commit/validate_config.py
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ build:
|
|||||||
python: "3.12"
|
python: "3.12"
|
||||||
jobs:
|
jobs:
|
||||||
post_checkout:
|
post_checkout:
|
||||||
|
- bash docs/maybe_skip_pr_build.sh
|
||||||
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
- git fetch origin main --unshallow --no-tags --filter=blob:none || true
|
||||||
pre_create_environment:
|
pre_create_environment:
|
||||||
- pip install uv
|
- pip install uv
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ python benchmark.py \
|
|||||||
## Hardware Requirements
|
## Hardware Requirements
|
||||||
|
|
||||||
| Backend | Hardware |
|
| Backend | Hardware |
|
||||||
|---------|----------|
|
| ------- | -------- |
|
||||||
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
| Flash/Triton/FlashInfer | Any CUDA GPU |
|
||||||
| CUTLASS MLA | Blackwell (SM100+) |
|
| CUTLASS MLA | Blackwell (SM100+) |
|
||||||
| FlashAttn MLA | Hopper (SM90+) |
|
| FlashAttn MLA | Hopper (SM90+) |
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
|
|||||||
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
max_kv_len = max(r.kv_len for r in requests) if requests else 0
|
||||||
return (batch_size, max_q_len, max_kv_len)
|
return (batch_size, max_q_len, max_kv_len)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback for unparseable specs
|
# Fallback for unparsable specs
|
||||||
return (0, 0, 0)
|
return (0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -145,7 +145,6 @@ def create_minimal_vllm_config(
|
|||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
gpu_memory_utilization=0.9,
|
gpu_memory_utilization=0.9,
|
||||||
swap_space=0,
|
|
||||||
cache_dtype="auto",
|
cache_dtype="auto",
|
||||||
enable_prefix_caching=False,
|
enable_prefix_caching=False,
|
||||||
)
|
)
|
||||||
@@ -701,7 +700,7 @@ def _run_single_benchmark(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(config.warmup_iters):
|
for _ in range(config.warmup_iters):
|
||||||
forward_fn()
|
forward_fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -714,7 +713,7 @@ def _run_single_benchmark(
|
|||||||
forward_fn()
|
forward_fn()
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
times.append(elapsed_ms / 1000.0 / config.num_layers)
|
||||||
|
|
||||||
|
|||||||
@@ -141,7 +141,6 @@ def _create_vllm_config(
|
|||||||
cache_config = CacheConfig(
|
cache_config = CacheConfig(
|
||||||
block_size=config.block_size,
|
block_size=config.block_size,
|
||||||
cache_dtype="auto",
|
cache_dtype="auto",
|
||||||
swap_space=0,
|
|
||||||
)
|
)
|
||||||
cache_config.num_gpu_blocks = max_num_blocks
|
cache_config.num_gpu_blocks = max_num_blocks
|
||||||
cache_config.num_cpu_blocks = 0
|
cache_config.num_cpu_blocks = 0
|
||||||
@@ -391,7 +390,7 @@ def _run_single_benchmark(
|
|||||||
attn_metadata,
|
attn_metadata,
|
||||||
output=out,
|
output=out,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
times = []
|
times = []
|
||||||
@@ -412,7 +411,7 @@ def _run_single_benchmark(
|
|||||||
)
|
)
|
||||||
end.record()
|
end.record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
elapsed_ms = start.elapsed_time(end)
|
elapsed_ms = start.elapsed_time(end)
|
||||||
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE
|
|||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||||
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
|
||||||
| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
|
||||||
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
| `TP` | **Required.** The tensor-parallelism size. | `1` |
|
||||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ def create_logits(
|
|||||||
|
|
||||||
def measure_memory() -> tuple[int, int]:
|
def measure_memory() -> tuple[int, int]:
|
||||||
"""Return (allocated, reserved) memory in bytes."""
|
"""Return (allocated, reserved) memory in bytes."""
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
|
||||||
|
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ def reset_memory_stats():
|
|||||||
"""Reset peak memory statistics."""
|
"""Reset peak memory statistics."""
|
||||||
reset_buffer_cache()
|
reset_buffer_cache()
|
||||||
torch.cuda.reset_peak_memory_stats()
|
torch.cuda.reset_peak_memory_stats()
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
@@ -123,7 +123,7 @@ def benchmark_function(
|
|||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
logits_copy = logits.clone()
|
logits_copy = logits.clone()
|
||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Reset memory stats before benchmark
|
# Reset memory stats before benchmark
|
||||||
reset_memory_stats()
|
reset_memory_stats()
|
||||||
@@ -140,7 +140,7 @@ def benchmark_function(
|
|||||||
func(logits_copy, k, p)
|
func(logits_copy, k, p)
|
||||||
end_events[i].record()
|
end_events[i].record()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Calculate timing
|
# Calculate timing
|
||||||
times = [
|
times = [
|
||||||
|
|||||||
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
98
benchmarks/kernels/bench_concat_mla_q.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
NUM_HEADS = 128
|
||||||
|
|
||||||
|
NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs():
|
||||||
|
return NUM_TOKENS
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(num_tokens, dtype):
|
||||||
|
"""Create inputs matching the real code path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
contiguous_nope: If False, simulate the transposed BMM output
|
||||||
|
(non-contiguous nope with stride pattern from
|
||||||
|
[N,B,L].transpose(0,1)).
|
||||||
|
"""
|
||||||
|
# Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L]
|
||||||
|
raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda")
|
||||||
|
ql_nope = raw.transpose(0, 1)
|
||||||
|
|
||||||
|
q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda")
|
||||||
|
return ql_nope, q_pe
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Non-contiguous nope benchmark (real code path) ----
|
||||||
|
@triton.testing.perf_report(
|
||||||
|
triton.testing.Benchmark(
|
||||||
|
x_names=["num_tokens"],
|
||||||
|
x_vals=get_configs(),
|
||||||
|
line_arg="provider",
|
||||||
|
line_vals=["torch_cat", "concat_mla_q"],
|
||||||
|
line_names=["torch.cat", "concat_mla_q (v8)"],
|
||||||
|
styles=[("blue", "--"), ("green", "-")],
|
||||||
|
ylabel="Latency (us)",
|
||||||
|
plot_name="concat_mla_q-transposed",
|
||||||
|
args={},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
def bench_transposed(num_tokens, provider):
|
||||||
|
dtype = torch.bfloat16
|
||||||
|
ql_nope, q_pe = make_inputs(num_tokens, dtype)
|
||||||
|
|
||||||
|
q_out = torch.empty(
|
||||||
|
num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda"
|
||||||
|
)
|
||||||
|
|
||||||
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
|
if provider == "torch_cat":
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500
|
||||||
|
)
|
||||||
|
|
||||||
|
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat")
|
||||||
|
parser.add_argument(
|
||||||
|
"--save-path", type=str, default=None, help="Path to save benchmark results"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("CONCAT MLA Q KERNEL BENCHMARKS")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}")
|
||||||
|
print(
|
||||||
|
f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = "
|
||||||
|
f"{(NOPE_DIM + ROPE_DIM) * 2} bytes"
|
||||||
|
)
|
||||||
|
print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
print("\n--- Non-contiguous nope inputs (transposed BMM output) ---")
|
||||||
|
bench_transposed.run(print_data=True, save_path=args.save_path)
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Benchmarking complete!")
|
||||||
|
print("=" * 70)
|
||||||
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
153
benchmarks/kernels/bench_cp_gather_fp8.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.triton_utils import triton
|
||||||
|
|
||||||
|
# DeepSeek V3 MLA dimensions
|
||||||
|
NOPE_DIM = 512
|
||||||
|
ROPE_DIM = 64
|
||||||
|
HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token
|
||||||
|
ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE
|
||||||
|
BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes
|
||||||
|
|
||||||
|
# Realistic prefill scenarios:
|
||||||
|
# - 1 long prefill: single request, 16K-96K tokens
|
||||||
|
# - 4 medium prefills: 4 requests, 4K-24K tokens each
|
||||||
|
# - 16 shorter prefills: 16 requests, 1K-6K tokens each
|
||||||
|
SCENARIOS = [
|
||||||
|
# (label, num_reqs, total_tokens_list)
|
||||||
|
("1-req", 1, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def make_inputs(total_tokens, num_reqs, block_size):
|
||||||
|
"""Create synthetic FP8 cache, block table, and output buffer.
|
||||||
|
|
||||||
|
Fills the cache with random bytes (we only measure throughput,
|
||||||
|
not correctness). Block table maps each request to contiguous
|
||||||
|
physical blocks.
|
||||||
|
"""
|
||||||
|
# Divide tokens evenly across requests
|
||||||
|
base_len = total_tokens // num_reqs
|
||||||
|
remainder = total_tokens % num_reqs
|
||||||
|
seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)]
|
||||||
|
|
||||||
|
# workspace_starts: cumulative sum of seq_lens
|
||||||
|
workspace_starts = [0] * num_reqs
|
||||||
|
for r in range(1, num_reqs):
|
||||||
|
workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1]
|
||||||
|
|
||||||
|
# Physical blocks needed per request
|
||||||
|
blocks_per_req = [math.ceil(s / block_size) for s in seq_lens]
|
||||||
|
total_blocks = sum(blocks_per_req)
|
||||||
|
max_blocks = max(blocks_per_req)
|
||||||
|
|
||||||
|
# Allocate cache with random data (content doesn't matter for perf)
|
||||||
|
cache = torch.randint(
|
||||||
|
0,
|
||||||
|
256,
|
||||||
|
(total_blocks, block_size, ENTRY_BYTES),
|
||||||
|
dtype=torch.uint8,
|
||||||
|
device="cuda",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Block table: contiguous block assignments
|
||||||
|
block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda")
|
||||||
|
block_idx = 0
|
||||||
|
for r in range(num_reqs):
|
||||||
|
for b in range(blocks_per_req[r]):
|
||||||
|
block_table[r, b] = block_idx
|
||||||
|
block_idx += 1
|
||||||
|
|
||||||
|
# Output workspace
|
||||||
|
dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda")
|
||||||
|
|
||||||
|
seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda")
|
||||||
|
workspace_starts_t = torch.tensor(
|
||||||
|
workspace_starts, dtype=torch.int32, device="cuda"
|
||||||
|
)
|
||||||
|
|
||||||
|
return cache, dst, block_table, seq_lens_t, workspace_starts_t
|
||||||
|
|
||||||
|
|
||||||
|
def bench_scenario(label, num_reqs, total_tokens_list, save_path):
|
||||||
|
"""Run benchmark for a specific (num_reqs, total_tokens) scenario."""
|
||||||
|
|
||||||
|
@triton.testing.perf_report(
|
||||||
|
triton.testing.Benchmark(
|
||||||
|
x_names=["total_tokens"],
|
||||||
|
x_vals=total_tokens_list,
|
||||||
|
line_arg="provider",
|
||||||
|
line_vals=["cuda_kernel"],
|
||||||
|
line_names=["cp_gather_fp8 (CUDA)"],
|
||||||
|
styles=[("green", "-")],
|
||||||
|
ylabel="Latency (us)",
|
||||||
|
plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}",
|
||||||
|
args={"num_reqs": num_reqs},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
def bench_fn(total_tokens, provider, num_reqs):
|
||||||
|
cache, dst, block_table, seq_lens_t, ws_starts = make_inputs(
|
||||||
|
total_tokens, num_reqs, BLOCK_SIZE
|
||||||
|
)
|
||||||
|
|
||||||
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
|
|
||||||
|
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||||
|
lambda: ops.cp_gather_and_upconvert_fp8_kv_cache(
|
||||||
|
cache, dst, block_table, seq_lens_t, ws_starts, num_reqs
|
||||||
|
),
|
||||||
|
quantiles=quantiles,
|
||||||
|
rep=500,
|
||||||
|
)
|
||||||
|
|
||||||
|
return ms * 1000, max_ms * 1000, min_ms * 1000 # us
|
||||||
|
|
||||||
|
seq_len_per_req = total_tokens_list[0] // num_reqs
|
||||||
|
seq_len_per_req_max = total_tokens_list[-1] // num_reqs
|
||||||
|
print(
|
||||||
|
f"\n--- {label}: {num_reqs} request(s), "
|
||||||
|
f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---"
|
||||||
|
)
|
||||||
|
bench_fn.run(print_data=True, save_path=save_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Benchmark cp_gather_and_upconvert_fp8_kv_cache"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--save-path",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Path to save benchmark results as CSV",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Print data volume info for bandwidth analysis
|
||||||
|
read_per_token = ENTRY_BYTES # 656 bytes from cache
|
||||||
|
write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace
|
||||||
|
total_per_token = read_per_token + write_per_token # 1808 bytes
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)")
|
||||||
|
print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes")
|
||||||
|
print(f"Per token: {total_per_token} bytes (read + write)")
|
||||||
|
print(f"Block size: {BLOCK_SIZE} tokens/block")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
for label, num_reqs, total_tokens_list in SCENARIOS:
|
||||||
|
bench_scenario(label, num_reqs, total_tokens_list, args.save_path)
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("Benchmarking complete!")
|
||||||
|
print("=" * 70)
|
||||||
@@ -168,7 +168,7 @@ def bench_impl(
|
|||||||
# warmup
|
# warmup
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
impl_type.get_impl()(**kwargs)
|
impl_type.get_impl()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
|
|||||||
# reference output
|
# reference output
|
||||||
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
|
||||||
|
|
||||||
# test ouptut
|
# test output
|
||||||
out_q, out_s = output_from_impl(
|
out_q, out_s = output_from_impl(
|
||||||
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -171,7 +171,7 @@ def bench_run(
|
|||||||
activation=MoEActivation.SILU,
|
activation=MoEActivation.SILU,
|
||||||
global_num_experts=num_experts,
|
global_num_experts=num_experts,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
@@ -187,14 +187,14 @@ def bench_run(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing
|
# Timing
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
@@ -202,7 +202,7 @@ def bench_run(
|
|||||||
|
|
||||||
latencies = []
|
latencies = []
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -307,7 +307,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -330,7 +330,7 @@ def bench_run(
|
|||||||
e=num_experts,
|
e=num_experts,
|
||||||
device=device,
|
device=device,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -345,7 +345,7 @@ def bench_run(
|
|||||||
w2_fp8scale,
|
w2_fp8scale,
|
||||||
a_fp8_scale,
|
a_fp8_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
|
|||||||
if not should_use_fn(tensor):
|
if not should_use_fn(tensor):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
stream = torch.cuda.Stream()
|
stream = torch.cuda.Stream()
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(stream):
|
||||||
graph_input = tensor.clone()
|
graph_input = tensor.clone()
|
||||||
@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
|
|||||||
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
|
||||||
allreduce_fn(graph_input)
|
allreduce_fn(graph_input)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_trials):
|
for _ in range(num_trials):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ def benchmark_operation(
|
|||||||
# Warmup before graph capture
|
# Warmup before graph capture
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Create CUDA graph
|
# Create CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
@@ -398,19 +398,19 @@ def benchmark_operation(
|
|||||||
operation_func(*args, **kwargs)
|
operation_func(*args, **kwargs)
|
||||||
|
|
||||||
# Graph warmup
|
# Graph warmup
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
# Benchmark with CUDA graph
|
# Benchmark with CUDA graph
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(trials // num_op_per_cudagraph):
|
for _ in range(trials // num_op_per_cudagraph):
|
||||||
# operation_func(*args, **kwargs)
|
# operation_func(*args, **kwargs)
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
|
|
||||||
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
avg_time_ms = ((end_time - start_time) / trials) * 1000
|
||||||
|
|||||||
@@ -224,7 +224,7 @@ def bench_run(
|
|||||||
def replay_graph(graph, num_repeats):
|
def replay_graph(graph, num_repeats):
|
||||||
for _ in range(num_repeats):
|
for _ in range(num_repeats):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
cutlass_stream = torch.cuda.Stream()
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
cutlass_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -239,7 +239,7 @@ def bench_run(
|
|||||||
topk_weights,
|
topk_weights,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
triton_stream = torch.cuda.Stream()
|
triton_stream = torch.cuda.Stream()
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
triton_graph = torch.cuda.CUDAGraph()
|
||||||
@@ -254,7 +254,7 @@ def bench_run(
|
|||||||
w2_scale,
|
w2_scale,
|
||||||
a_scale,
|
a_scale,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
min_run_time = 5
|
min_run_time = 5
|
||||||
num_warmup = 5
|
num_warmup = 5
|
||||||
|
|||||||
@@ -34,14 +34,14 @@ def main(
|
|||||||
residual = torch.randn_like(x) * scale if add_residual else None
|
residual = torch.randn_like(x) * scale if add_residual else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
layer(x, residual)
|
layer(x, residual)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -1035,7 +1035,7 @@ def bench_optype(
|
|||||||
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
|
||||||
for kwargs in kwargs_list:
|
for kwargs in kwargs_list:
|
||||||
op_type.bench_fn()(**kwargs)
|
op_type.bench_fn()(**kwargs)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Merge into a single kwargs and qualify arguments as ArgPool
|
# Merge into a single kwargs and qualify arguments as ArgPool
|
||||||
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
|
||||||
|
|||||||
@@ -47,13 +47,13 @@ def benchmark_method(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(num_warmup):
|
for _ in range(num_warmup):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
_ = method(k_nope, k_pe)
|
_ = method(k_nope, k_pe)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
|
|
||||||
return (end - start) / num_iters * 1000 # Convert to ms
|
return (end - start) / num_iters * 1000 # Convert to ms
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ def clear_triton_cache():
|
|||||||
|
|
||||||
# Clear CUDA memory cache
|
# Clear CUDA memory cache
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
# Try to clear Triton's runtime cache
|
# Try to clear Triton's runtime cache
|
||||||
try:
|
try:
|
||||||
@@ -304,19 +304,19 @@ def benchmark_config(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -324,7 +324,7 @@ def benchmark_config(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ def benchmark_config(
|
|||||||
topk_ids,
|
topk_ids,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Benchmark
|
# Benchmark
|
||||||
start = torch.cuda.Event(enable_timing=True)
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
@@ -149,7 +149,7 @@ def benchmark_config(
|
|||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
)
|
)
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -69,19 +69,19 @@ def benchmark_permute(
|
|||||||
|
|
||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -89,7 +89,7 @@ def benchmark_permute(
|
|||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
prepare(i)
|
prepare(i)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
@@ -159,26 +159,26 @@ def benchmark_unpermute(
|
|||||||
# JIT compilation & warmup
|
# JIT compilation & warmup
|
||||||
input = prepare()
|
input = prepare()
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Capture 10 invocations with CUDA graph
|
# Capture 10 invocations with CUDA graph
|
||||||
graph = torch.cuda.CUDAGraph()
|
graph = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(graph):
|
with torch.cuda.graph(graph):
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
run(input)
|
run(input)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
graph.replay()
|
graph.replay()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
graph.replay()
|
graph.replay()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -135,14 +135,14 @@ def benchmark_mrope(
|
|||||||
key.clone(),
|
key.clone(),
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Time reference implementation
|
# Time reference implementation
|
||||||
torch_times = []
|
torch_times = []
|
||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
mrope_helper_class.forward_native(
|
mrope_helper_class.forward_native(
|
||||||
@@ -151,7 +151,7 @@ def benchmark_mrope(
|
|||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
torch_times.append(time.time() - start_time)
|
torch_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Time triton kernel implementation
|
# Time triton kernel implementation
|
||||||
@@ -159,14 +159,14 @@ def benchmark_mrope(
|
|||||||
for _ in range(benchmark_iter):
|
for _ in range(benchmark_iter):
|
||||||
query_clone = query.clone()
|
query_clone = query.clone()
|
||||||
key_clone = key.clone()
|
key_clone = key.clone()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
mrope_helper_class.forward_cuda(
|
mrope_helper_class.forward_cuda(
|
||||||
positions,
|
positions,
|
||||||
query_clone,
|
query_clone,
|
||||||
key_clone,
|
key_clone,
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
triton_times.append(time.time() - start_time)
|
triton_times.append(time.time() - start_time)
|
||||||
|
|
||||||
# Calculate statistics
|
# Calculate statistics
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ def main(
|
|||||||
max_logits = torch.empty_like(exp_sums)
|
max_logits = torch.empty_like(exp_sums)
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -173,7 +173,7 @@ def main(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid version: {version}")
|
raise ValueError(f"Invalid version: {version}")
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def _time_cuda(
|
|||||||
# warmup
|
# warmup
|
||||||
for _ in range(warmup_iters):
|
for _ in range(warmup_iters):
|
||||||
fn()
|
fn()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
@@ -37,7 +37,7 @@ def _time_cuda(
|
|||||||
for _ in range(bench_iters):
|
for _ in range(bench_iters):
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
return start.elapsed_time(end) / bench_iters # ms/iter
|
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def main(
|
|||||||
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
||||||
|
|
||||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -39,7 +39,7 @@ def main(
|
|||||||
ops.scaled_int8_quant(x, scale)
|
ops.scaled_int8_quant(x, scale)
|
||||||
else:
|
else:
|
||||||
ops.scaled_fp8_quant(x, scale)
|
ops.scaled_fp8_quant(x, scale)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
if profile:
|
if profile:
|
||||||
|
|||||||
@@ -84,16 +84,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -104,7 +104,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -109,16 +109,16 @@ def run_benchmark(
|
|||||||
g = torch.cuda.CUDAGraph()
|
g = torch.cuda.CUDAGraph()
|
||||||
with torch.cuda.graph(g):
|
with torch.cuda.graph(g):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
function_under_test = lambda: g.replay()
|
function_under_test = lambda: g.replay()
|
||||||
|
|
||||||
def run_cuda_benchmark(n_iters: int) -> float:
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
nonlocal key, value, key_cache, value_cache, slot_mapping
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
for _ in range(n_iters):
|
for _ in range(n_iters):
|
||||||
function_under_test()
|
function_under_test()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return (end - start) / n_iters
|
return (end - start) / n_iters
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ def run_benchmark(
|
|||||||
|
|
||||||
# free tensors to mitigate OOM when sweeping
|
# free tensors to mitigate OOM when sweeping
|
||||||
del key, value, key_cache, value_cache, slot_mapping
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
torch.cuda.empty_cache()
|
torch.accelerator.empty_cache()
|
||||||
|
|
||||||
return lat
|
return lat
|
||||||
|
|
||||||
|
|||||||
@@ -251,7 +251,7 @@ def benchmark(
|
|||||||
kernel(
|
kernel(
|
||||||
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
||||||
)
|
)
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
@@ -259,7 +259,7 @@ def benchmark(
|
|||||||
# Benchmark
|
# Benchmark
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for _ in range(runs):
|
for _ in range(runs):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event.record()
|
start_event.record()
|
||||||
for i in range(iterations_per_run):
|
for i in range(iterations_per_run):
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ def benchmark_decode(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -136,7 +136,7 @@ def benchmark_decode(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ def benchmark_prefill(
|
|||||||
)
|
)
|
||||||
|
|
||||||
def time_fn(fn, warmup=10, trials=20):
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = torch.Event(enable_timing=True)
|
start = torch.Event(enable_timing=True)
|
||||||
end = torch.Event(enable_timing=True)
|
end = torch.Event(enable_timing=True)
|
||||||
times = []
|
times = []
|
||||||
@@ -148,7 +148,7 @@ def benchmark_prefill(
|
|||||||
start.record()
|
start.record()
|
||||||
fn()
|
fn()
|
||||||
end.record()
|
end.record()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
times.append(start.elapsed_time(end)) # ms
|
times.append(start.elapsed_time(end)) # ms
|
||||||
return sum(times) / len(times), torch.std(torch.tensor(times))
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
|||||||
@@ -177,18 +177,18 @@ def benchmark_config(
|
|||||||
def run():
|
def run():
|
||||||
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
|
||||||
|
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
# JIT complication & warmup
|
# JIT complication & warmup
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
run()
|
run()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
start_event = torch.Event(enable_timing=True)
|
start_event = torch.Event(enable_timing=True)
|
||||||
end_event = torch.Event(enable_timing=True)
|
end_event = torch.Event(enable_timing=True)
|
||||||
|
|
||||||
latencies: list[float] = []
|
latencies: list[float] = []
|
||||||
for i in range(num_iters):
|
for i in range(num_iters):
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start_event.record()
|
start_event.record()
|
||||||
run()
|
run()
|
||||||
end_event.record()
|
end_event.record()
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def benchmark_shape(
|
|||||||
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Reference result in BF16
|
# Reference result in BF16
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
C_ref = A @ B.t()
|
C_ref = A @ B.t()
|
||||||
|
|
||||||
# Pre-quantize B for all implementations
|
# Pre-quantize B for all implementations
|
||||||
@@ -121,14 +121,14 @@ def benchmark_shape(
|
|||||||
# Warmup
|
# Warmup
|
||||||
for _ in range(warmup):
|
for _ in range(warmup):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
|
|
||||||
# Timing loop
|
# Timing loop
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
start = time.time()
|
start = time.time()
|
||||||
for _ in range(repeat):
|
for _ in range(repeat):
|
||||||
func()
|
func()
|
||||||
torch.cuda.synchronize()
|
torch.accelerator.synchronize()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|
||||||
# Calculate timing and TFLOPS
|
# Calculate timing and TFLOPS
|
||||||
|
|||||||
@@ -242,6 +242,16 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
|
|||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
message(STATUS "Downloading oneDNN from GitHub")
|
message(STATUS "Downloading oneDNN from GitHub")
|
||||||
|
if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
|
||||||
|
message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a")
|
||||||
|
FetchContent_Declare(
|
||||||
|
oneDNN
|
||||||
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
|
GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
GIT_SHALLOW FALSE
|
||||||
|
)
|
||||||
|
else()
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
oneDNN
|
oneDNN
|
||||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
@@ -250,6 +260,7 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
|
|||||||
GIT_SHALLOW TRUE
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
set(ONEDNN_LIBRARY_TYPE "STATIC")
|
||||||
set(ONEDNN_BUILD_DOC "OFF")
|
set(ONEDNN_BUILD_DOC "OFF")
|
||||||
|
|||||||
@@ -74,6 +74,12 @@ void indexer_k_quant_and_cache(
|
|||||||
int64_t quant_block_size, // quantization block size
|
int64_t quant_block_size, // quantization block size
|
||||||
const std::string& scale_fmt);
|
const std::string& scale_fmt);
|
||||||
|
|
||||||
|
// Concatenate query nope and rope for MLA/DSA attention
|
||||||
|
void concat_mla_q(
|
||||||
|
torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim]
|
||||||
|
|
||||||
// Extract function to gather quantized K cache
|
// Extract function to gather quantized K cache
|
||||||
void cp_gather_indexer_k_quant_cache(
|
void cp_gather_indexer_k_quant_cache(
|
||||||
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
#include "cuda_compat.h"
|
#include "cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
#include "quantization/vectorization_utils.cuh"
|
||||||
|
#include "concat_mla_q.cuh"
|
||||||
|
|
||||||
#ifdef USE_ROCM
|
#ifdef USE_ROCM
|
||||||
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
#include "quantization/w8a8/fp8/amd/quant_utils.cuh"
|
||||||
@@ -995,75 +996,67 @@ namespace vllm {
|
|||||||
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
// Similar to cp_gather_cache but specifically for FP8->BF16 conversion
|
||||||
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
__global__ void cp_gather_and_upconvert_fp8_kv_cache(
|
||||||
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656]
|
||||||
__nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576]
|
__nv_bfloat16* __restrict__ dst, // [total_tokens, 576]
|
||||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES]
|
||||||
const int32_t* __restrict__ seq_lens, // [BATCH]
|
const int32_t* __restrict__ workspace_starts, // [num_reqs]
|
||||||
const int32_t* __restrict__ workspace_starts, // [BATCH]
|
const int32_t num_reqs, const int32_t block_size,
|
||||||
const int32_t block_size, const int32_t head_dim,
|
const int32_t total_tokens, const int64_t block_table_stride,
|
||||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
const int64_t cache_block_stride, const int64_t cache_entry_stride,
|
||||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride) {
|
const int64_t dst_entry_stride) {
|
||||||
const int64_t bid = blockIdx.x; // Batch ID
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
const int32_t num_splits = gridDim.y;
|
if (flat_warp_id >= total_tokens) return;
|
||||||
const int32_t split = blockIdx.y;
|
const int lane_id = threadIdx.x & 31;
|
||||||
const int32_t seq_start = workspace_starts[bid];
|
|
||||||
const int32_t seq_len = seq_lens[bid];
|
|
||||||
const int32_t tot_slots = seq_len;
|
|
||||||
const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
|
|
||||||
|
|
||||||
const int32_t split_start = split * split_slots;
|
// Binary search to find which request owns this output token
|
||||||
const int32_t split_end = min((split + 1) * split_slots, tot_slots);
|
int lo = 0, hi = num_reqs - 1;
|
||||||
|
while (lo < hi) {
|
||||||
|
int mid = (lo + hi + 1) >> 1;
|
||||||
|
if (workspace_starts[mid] <= flat_warp_id)
|
||||||
|
lo = mid;
|
||||||
|
else
|
||||||
|
hi = mid - 1;
|
||||||
|
}
|
||||||
|
const int req_id = lo;
|
||||||
|
|
||||||
const bool is_active_split = (split_start < tot_slots);
|
// Compute physical token address via block table
|
||||||
|
const int out_token_id = flat_warp_id;
|
||||||
|
const int token_offset = out_token_id - workspace_starts[req_id];
|
||||||
|
const int cache_block_idx = token_offset / block_size;
|
||||||
|
const int offset_in_block = token_offset % block_size;
|
||||||
|
const int physical_block =
|
||||||
|
block_table[req_id * block_table_stride + cache_block_idx];
|
||||||
|
|
||||||
if (!is_active_split) return;
|
const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride +
|
||||||
|
offset_in_block * cache_entry_stride;
|
||||||
|
|
||||||
// Adjust the pointer for the block_table for this batch
|
const int4* nope_src = reinterpret_cast<const int4*>(token_ptr);
|
||||||
const int32_t batch_offset = bid * block_table_stride;
|
const int4 fp8_data = nope_src[lane_id];
|
||||||
int32_t offset = split_start;
|
|
||||||
int32_t offset_div = offset / block_size;
|
|
||||||
offset = offset % block_size;
|
|
||||||
const int32_t* batch_block_table = block_table + batch_offset;
|
|
||||||
|
|
||||||
// Adjust dst pointer based on the cumulative sequence lengths
|
|
||||||
dst += seq_start * dst_entry_stride;
|
|
||||||
|
|
||||||
const int tid = threadIdx.x;
|
|
||||||
|
|
||||||
// Process each token in this split
|
|
||||||
for (int pid = split_start; pid < split_end; ++pid) {
|
|
||||||
auto block_id = batch_block_table[offset_div];
|
|
||||||
const uint8_t* token_ptr =
|
|
||||||
src_cache + block_id * cache_block_stride + offset * cache_entry_stride;
|
|
||||||
__nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride;
|
|
||||||
|
|
||||||
// FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16)
|
|
||||||
const uint8_t* no_pe_ptr = token_ptr;
|
|
||||||
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
const float* scales_ptr = reinterpret_cast<const float*>(token_ptr + 512);
|
||||||
const __nv_bfloat16* rope_ptr =
|
const float scale = scales_ptr[lane_id >> 3];
|
||||||
reinterpret_cast<const __nv_bfloat16*>(token_ptr + 512 + 16);
|
|
||||||
|
|
||||||
// Parallelize fp8 dequant (512 elements) and rope copy (64 elements)
|
const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y);
|
||||||
if (tid < 512) {
|
const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w);
|
||||||
// FP8 dequantization
|
#ifdef USE_ROCM
|
||||||
const int tile = tid >> 7; // each tile is 128 elements
|
const bf16_8_t bf16_lo =
|
||||||
const float scale = scales_ptr[tile];
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale);
|
||||||
const uint8_t val = no_pe_ptr[tid];
|
const bf16_8_t bf16_hi =
|
||||||
dst_ptr[tid] =
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale);
|
||||||
fp8::scaled_convert<__nv_bfloat16, uint8_t,
|
#else
|
||||||
vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale);
|
const bf16_8_t bf16_lo =
|
||||||
} else if (tid < 576) {
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_lo, scale, __NV_E4M3);
|
||||||
// Rope copy (64 bf16 elements)
|
const bf16_8_t bf16_hi =
|
||||||
const int rope_idx = tid - 512;
|
fp8::scaled_vec_conversion<bf16_8_t, uint2>(fp8_hi, scale, __NV_E4M3);
|
||||||
dst_ptr[512 + rope_idx] = rope_ptr[rope_idx];
|
#endif
|
||||||
}
|
|
||||||
|
|
||||||
// Move to next token
|
__nv_bfloat16* dst_ptr = dst + out_token_id * dst_entry_stride;
|
||||||
offset += 1;
|
int4* nope_dst = reinterpret_cast<int4*>(dst_ptr) + lane_id * 2;
|
||||||
if (offset == block_size) {
|
nope_dst[0] = *reinterpret_cast<const int4*>(&bf16_lo);
|
||||||
offset_div += 1;
|
nope_dst[1] = *reinterpret_cast<const int4*>(&bf16_hi);
|
||||||
offset = 0;
|
|
||||||
}
|
const int* rope_src = reinterpret_cast<const int*>(token_ptr + 528);
|
||||||
}
|
int* rope_dst = reinterpret_cast<int*>(dst_ptr + 512);
|
||||||
|
rope_dst[lane_id] = rope_src[lane_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
@@ -1257,15 +1250,16 @@ void cp_gather_and_upconvert_fp8_kv_cache(
|
|||||||
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
src_ptr = reinterpret_cast<const uint8_t*>(src_cache.data_ptr());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decide on the number of splits based on the batch size
|
const int total_tokens = dst.size(0);
|
||||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
constexpr int warps_per_block = 8;
|
||||||
dim3 grid(batch_size, num_splits);
|
const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block;
|
||||||
dim3 block(576);
|
const int block_size_threads = warps_per_block * 32; // 256 threads
|
||||||
|
|
||||||
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid, block, 0, stream>>>(
|
vllm::cp_gather_and_upconvert_fp8_kv_cache<<<grid_size, block_size_threads, 0,
|
||||||
|
stream>>>(
|
||||||
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()),
|
||||||
block_table.data_ptr<int32_t>(), seq_lens.data_ptr<int32_t>(),
|
block_table.data_ptr<int32_t>(), workspace_starts.data_ptr<int32_t>(),
|
||||||
workspace_starts.data_ptr<int32_t>(), block_size, head_dim,
|
static_cast<int32_t>(batch_size), block_size, total_tokens,
|
||||||
block_table_stride, cache_block_stride, cache_entry_stride,
|
block_table_stride, cache_block_stride, cache_entry_stride,
|
||||||
dst_entry_stride);
|
dst_entry_stride);
|
||||||
}
|
}
|
||||||
@@ -1365,3 +1359,43 @@ void cp_gather_indexer_k_quant_cache(
|
|||||||
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA.
|
||||||
|
// Replaces torch.cat((ql_nope, q_pe), dim=-1).
|
||||||
|
void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim]
|
||||||
|
torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim]
|
||||||
|
torch::Tensor& q_out // [num_tokens, num_heads, nope_dim +
|
||||||
|
// rope_dim]
|
||||||
|
) {
|
||||||
|
const int num_tokens = ql_nope.size(0);
|
||||||
|
const int num_heads = ql_nope.size(1);
|
||||||
|
const int nope_dim = ql_nope.size(2);
|
||||||
|
const int rope_dim = q_pe.size(2);
|
||||||
|
|
||||||
|
TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ",
|
||||||
|
nope_dim);
|
||||||
|
TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim);
|
||||||
|
TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim);
|
||||||
|
|
||||||
|
TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2");
|
||||||
|
TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2");
|
||||||
|
|
||||||
|
if (num_tokens == 0) return;
|
||||||
|
|
||||||
|
constexpr int warps_per_block = 8;
|
||||||
|
const int total_warps = num_tokens * num_heads;
|
||||||
|
const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block;
|
||||||
|
const int block_size = warps_per_block * 32;
|
||||||
|
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] {
|
||||||
|
vllm::ConcatMLAQKernel<scalar_t, 512><<<grid_size, block_size, 0, stream>>>(
|
||||||
|
q_out.data_ptr<scalar_t>(), ql_nope.data_ptr<scalar_t>(),
|
||||||
|
q_pe.data_ptr<scalar_t>(), num_tokens, num_heads, q_out.stride(0),
|
||||||
|
q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0),
|
||||||
|
q_pe.stride(1));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
60
csrc/concat_mla_q.cuh
Normal file
60
csrc/concat_mla_q.cuh
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#ifndef CONCAT_MLA_Q_CUH_
|
||||||
|
#define CONCAT_MLA_Q_CUH_
|
||||||
|
|
||||||
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
#include "cuda_vec_utils.cuh"
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
|
||||||
|
// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and
|
||||||
|
// q_pe [num_tokens, num_heads, 64]
|
||||||
|
// into q_out [num_tokens, num_heads, NOPE_DIM+64].
|
||||||
|
// Currently instantiated only for NOPE_DIM=512.
|
||||||
|
// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA)
|
||||||
|
template <typename DType, int NOPE_DIM>
|
||||||
|
__global__ void ConcatMLAQKernel(
|
||||||
|
DType* __restrict__ q_out, const DType* __restrict__ ql_nope,
|
||||||
|
const DType* __restrict__ q_pe, const int num_tokens, const int num_heads,
|
||||||
|
const int64_t out_stride_0, const int64_t out_stride_1,
|
||||||
|
const int64_t nope_stride_0, const int64_t nope_stride_1,
|
||||||
|
const int64_t pe_stride_0, const int64_t pe_stride_1) {
|
||||||
|
const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5;
|
||||||
|
if (flat_warp_id >= num_tokens * num_heads) return;
|
||||||
|
|
||||||
|
const int token_id = flat_warp_id / num_heads;
|
||||||
|
const int head_id = flat_warp_id % num_heads;
|
||||||
|
const int lane_id = threadIdx.x & 31;
|
||||||
|
|
||||||
|
constexpr bool use_256b = VLLM_256B_PTX_ENABLED;
|
||||||
|
constexpr int nope_vec_loads =
|
||||||
|
NOPE_DIM * sizeof(DType) / (VecTraits<use_256b>::ARCH_MAX_VEC_SIZE * 32);
|
||||||
|
|
||||||
|
const DType* nope_src =
|
||||||
|
ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1;
|
||||||
|
DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < nope_vec_loads; i++) {
|
||||||
|
const int offset = i * 32 + lane_id;
|
||||||
|
if constexpr (use_256b) {
|
||||||
|
st256_cs(reinterpret_cast<u32x8_t*>(nope_dst) + offset,
|
||||||
|
ld256_cs(reinterpret_cast<const u32x8_t*>(nope_src) + offset));
|
||||||
|
} else {
|
||||||
|
st128_cs(reinterpret_cast<int4*>(nope_dst) + offset,
|
||||||
|
ld128_cs(reinterpret_cast<const int4*>(nope_src) + offset));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int* rope_src = reinterpret_cast<const int*>(
|
||||||
|
q_pe + token_id * pe_stride_0 + head_id * pe_stride_1);
|
||||||
|
int* rope_dst = reinterpret_cast<int*>(q_out + token_id * out_stride_0 +
|
||||||
|
head_id * out_stride_1 + NOPE_DIM);
|
||||||
|
|
||||||
|
st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
#endif // CONCAT_MLA_Q_CUH_
|
||||||
@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
|
|||||||
const int64_t block_size, const int64_t block_size_stride) {
|
const int64_t block_size, const int64_t block_size_stride) {
|
||||||
// For AMX 2D tiles, size of each line is 64 bytes
|
// For AMX 2D tiles, size of each line is 64 bytes
|
||||||
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
|
||||||
// For AMX B martix, N always is 16
|
// For AMX B matrix, N always is 16
|
||||||
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
|
||||||
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
|
||||||
// For now suppose block_size is divisible by amx_tile_column_num
|
// For now suppose block_size is divisible by amx_tile_column_num
|
||||||
|
|||||||
@@ -237,13 +237,10 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
|
|||||||
};
|
};
|
||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
#ifdef __aarch64__
|
|
||||||
// dummy M size for prepacking weights
|
// dummy M size for prepacking weights
|
||||||
// Prepacking weights improves performance and avoid runtime reorders
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
constexpr dnnl_dim_t kProbeM = 128;
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
#else
|
|
||||||
constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
@@ -411,19 +408,17 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
|
|||||||
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
|
||||||
{b_k_stride_, b_n_stride_});
|
{b_k_stride_, b_n_stride_});
|
||||||
|
|
||||||
|
// dummy M size for prepacking weights
|
||||||
|
// Prepacking weights improves performance and avoid runtime reorders
|
||||||
|
constexpr dnnl_dim_t kProbeM = 128;
|
||||||
|
|
||||||
prepack_weight(args.b_ptr, original_b_md,
|
prepack_weight(args.b_ptr, original_b_md,
|
||||||
create_primitive_desc(
|
create_primitive_desc(
|
||||||
MSizeCacheKey{
|
MSizeCacheKey{// Use a concrete M so oneDNN's kernel
|
||||||
#ifdef VLLM_USE_ACL
|
// selector can choose an optimally blocked
|
||||||
// Arm Compute Library (ACL) backend for oneDNN does
|
// weight layout.
|
||||||
// not support runtime
|
.a_m_size = kProbeM,
|
||||||
// dimensions, so we set M to a default value
|
|
||||||
.a_m_size = 128,
|
|
||||||
.a_m_stride = b_k_size_,
|
.a_m_stride = b_k_size_,
|
||||||
#else
|
|
||||||
.a_m_size = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
.a_m_stride = DNNL_RUNTIME_DIM_VAL,
|
|
||||||
#endif
|
|
||||||
.use_bias = false,
|
.use_bias = false,
|
||||||
.bias_type = dnnl::memory::data_type::undef},
|
.bias_type = dnnl::memory::data_type::undef},
|
||||||
true)
|
true)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#include <torch/library.h>
|
#include <torch/library.h>
|
||||||
|
|
||||||
// Note: overwrite the external defination for sharing same name between
|
// Note: overwrite the external definition for sharing same name between
|
||||||
// libraries use different ISAs.
|
// libraries use different ISAs.
|
||||||
#define TORCH_EXTENSION_NAME _C
|
#define TORCH_EXTENSION_NAME _C
|
||||||
|
|
||||||
|
|||||||
@@ -196,7 +196,6 @@ __forceinline__ __device__ u32x8_t ld256_cs(const u32x8_t* addr) {
|
|||||||
return val;
|
return val;
|
||||||
#else
|
#else
|
||||||
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
assert(false && "ld256_cs requires SM100+ with CUDA 12.9+");
|
||||||
return {};
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,23 +210,51 @@ __forceinline__ __device__ void st256_cs(u32x8_t* addr, u32x8_t val) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// 32-bit cache-streaming (.cs) load / store — SM100+ only.
|
// 32-bit load / store.
|
||||||
|
__device__ __forceinline__ int ld32(const int* addr) { return __ldg(addr); }
|
||||||
|
|
||||||
|
__device__ __forceinline__ void st32(int* addr, int val) { *addr = val; }
|
||||||
|
|
||||||
|
// 32-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld32/st32 on ROCm (no .cs hint).
|
||||||
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
__forceinline__ __device__ int ld32_cs(const int* addr) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
|
||||||
int val;
|
int val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
asm volatile("ld.global.cs.b32 %0, [%1];" : "=r"(val) : "l"(addr));
|
||||||
return val;
|
|
||||||
#else
|
#else
|
||||||
assert(false && "ld32_cs requires SM100+ with CUDA 12.9+");
|
val = ld32(addr);
|
||||||
return 0;
|
|
||||||
#endif
|
#endif
|
||||||
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
__forceinline__ __device__ void st32_cs(int* addr, int val) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
asm volatile("st.global.cs.b32 [%0], %1;" ::"l"(addr), "r"(val));
|
||||||
#else
|
#else
|
||||||
assert(false && "st32_cs requires SM100+ with CUDA 12.9+");
|
st32(addr, val);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// 128-bit cache-streaming (.cs) load / store.
|
||||||
|
// Falls back to ld128/st128 on ROCm (no .cs hint).
|
||||||
|
__forceinline__ __device__ int4 ld128_cs(const int4* addr) {
|
||||||
|
int4 val;
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"
|
||||||
|
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
|
||||||
|
: "l"(addr));
|
||||||
|
#else
|
||||||
|
ld128(val, addr);
|
||||||
|
#endif
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
__forceinline__ __device__ void st128_cs(int4* addr, int4 val) {
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(addr),
|
||||||
|
"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
|
||||||
|
#else
|
||||||
|
st128(val, addr);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +287,7 @@ __device__ __forceinline__ void ld256_cg_or_zero(u32x8_t& val, const void* ptr,
|
|||||||
|
|
||||||
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
__device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
||||||
bool pred) {
|
bool pred) {
|
||||||
#if VLLM_256B_PTX_ENABLED
|
#ifndef USE_ROCM
|
||||||
uint32_t r0, r1, r2, r3;
|
uint32_t r0, r1, r2, r3;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
@@ -278,7 +305,7 @@ __device__ __forceinline__ void ld128_cg_or_zero(uint4& val, const void* ptr,
|
|||||||
|
|
||||||
val = uint4{r0, r1, r2, r3};
|
val = uint4{r0, r1, r2, r3};
|
||||||
#else
|
#else
|
||||||
assert(false && "ld128_cg_or_zero requires SM100+ with CUDA 12.9+");
|
assert(false && "ld128_cg_or_zero is not supported on ROCm");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
|
|||||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||||
int32_t const SENTINEL =
|
int32_t const SENTINEL =
|
||||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||||
// Intialize sorted_ids
|
// Initialize sorted_ids
|
||||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||||
sorted_ids[i] = SENTINEL;
|
sorted_ids[i] = SENTINEL;
|
||||||
}
|
}
|
||||||
// Intialize expert_ids with -1
|
// Initialize expert_ids with -1
|
||||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||||
block_ids[i] = -1;
|
block_ids[i] = -1;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
|||||||
if (!lane_id) {
|
if (!lane_id) {
|
||||||
// Store scales.
|
// Store scales.
|
||||||
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
if constexpr (std::is_same<scale_t, uint8_t>::value) {
|
||||||
// Packed UE8MO format. Remove Mantissa.
|
// Packed UE8M0 format. Remove Mantissa.
|
||||||
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
|
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
|
||||||
|
|
||||||
bool const jump_pack = (current_group_id + 1) % 4 == 0;
|
bool const jump_pack = (current_group_id + 1) % 4 == 0;
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "quantization/w8a8/fp8/common.cuh"
|
#include "quantization/w8a8/fp8/common.cuh"
|
||||||
|
#include "core/batch_invariant.hpp"
|
||||||
|
|
||||||
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
|
// TODO(rasmith): The kernels in this file are susceptible to integer overflow
|
||||||
// issues, do not take strides, and are unable to handle PyTorch tensors that
|
// issues, do not take strides, and are unable to handle PyTorch tensors that
|
||||||
@@ -1224,17 +1225,14 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
#if defined(__gfx950__)
|
#if defined(__gfx950__)
|
||||||
#define WVSPLITKRC_1KPASS
|
#define WVSPLITKRC_1KPASS
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N, int GrpsShrB, int CHUNKK>
|
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||||
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
__global__ void __launch_bounds__(WvPrGrp* THRDS)
|
||||||
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
__attribute__((amdgpu_waves_per_eu(1, 1)))
|
||||||
wvSplitKrc_(const int actlN, const int K, const int M, const int Bx,
|
wvSplitKrc_(const int actlN, const int K, const int Kap, const int M,
|
||||||
const int By, const scalar_t* __restrict__ B,
|
const int Bx, const int By, const scalar_t* __restrict__ A,
|
||||||
const scalar_t* __restrict__ A,
|
const scalar_t* __restrict__ B,
|
||||||
const scalar_t* __restrict__ BIAS, float* glbl, scalar_t* C,
|
const scalar_t* __restrict__ BIAS, float* glbl, int* cntr,
|
||||||
const int CuCount) {
|
scalar_t* C, const int CuCount) {
|
||||||
// Use upper half of glbl buffer for atomic reduce counting
|
|
||||||
int* cntr = (int*)(&glbl[M * N]);
|
|
||||||
|
|
||||||
constexpr int NTILE = 16;
|
constexpr int NTILE = 16;
|
||||||
constexpr int APAD = 1;
|
constexpr int APAD = 1;
|
||||||
constexpr int ASTRD = 64;
|
constexpr int ASTRD = 64;
|
||||||
@@ -1425,10 +1423,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff);
|
||||||
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
|
for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) {
|
||||||
__builtin_amdgcn_global_load_lds(
|
__builtin_amdgcn_global_load_lds(
|
||||||
(int*)(&A[min__(
|
(int*)(&A[min__(Kap * actlN - A_CHUNK,
|
||||||
K * actlN - A_CHUNK,
|
kOffcp + Kap * (n / CHUNKK +
|
||||||
kOffcp + K * (n / CHUNKK +
|
(N / CHUNKK) * (threadIdx.x /
|
||||||
(N / CHUNKK) * (threadIdx.x / (64 / CHUNKK)) +
|
(64 / CHUNKK)) +
|
||||||
(threadIdx.y % sprdN)))]),
|
(threadIdx.y % sprdN)))]),
|
||||||
(int*)(&s[(k +
|
(int*)(&s[(k +
|
||||||
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
|
kFitPdd * ((n / CHUNKK) + (threadIdx.y % sprdN)))]),
|
||||||
@@ -1476,7 +1474,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
// B[] staging is cooperative across GrpsShrB, so sync here before reading
|
||||||
// back. This wait is currently inserted by compiler, but not gauranteed.
|
// back. This wait is currently inserted by compiler, but not guaranteed.
|
||||||
asm volatile("s_waitcnt 0");
|
asm volatile("s_waitcnt 0");
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
@@ -1533,30 +1531,66 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
union flt4 {
|
||||||
|
scalar8 s8;
|
||||||
|
float2 f2[2];
|
||||||
|
float4 f4;
|
||||||
|
};
|
||||||
if (m + (threadIdx.x % 16) < M) {
|
if (m + (threadIdx.x % 16) < M) {
|
||||||
int my_cntr;
|
int my_cntr;
|
||||||
int mindx = m + (threadIdx.x % 16);
|
int mindx = m + (threadIdx.x % 16);
|
||||||
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
|
int g_mindx = m * 4 + (threadIdx.x % 64); // coalesced atomic reduction
|
||||||
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
scalar_t biases[N / NTILE / GrpsShrB][4] = {};
|
||||||
// Atomic add the output, read biases
|
// Atomic add the output, read biases
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++)
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
|
||||||
// int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
|
||||||
// (N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
|
||||||
// int adr = mindx + M * nindx;
|
|
||||||
int g_nindx =
|
int g_nindx =
|
||||||
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
int g_adr = g_mindx + M * g_nindx * 4;
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
atomicAdd(&glbl[g_adr], sum4[nt][0][j]);
|
if (DTRMNSTC) {
|
||||||
|
flt4 flt4_ = {.s8 = sum4[nt][0]};
|
||||||
|
__hip_atomic_store((float2*)&glbl[g_adr + M * N * (m0 / Mmod)],
|
||||||
|
flt4_.f2[0], __ATOMIC_RELAXED,
|
||||||
|
__HIP_MEMORY_SCOPE_AGENT);
|
||||||
|
__hip_atomic_store((float2*)&glbl[g_adr + 2 + M * N * (m0 / Mmod)],
|
||||||
|
flt4_.f2[1], __ATOMIC_RELAXED,
|
||||||
|
__HIP_MEMORY_SCOPE_AGENT);
|
||||||
|
} else {
|
||||||
|
for (uint32_t j = 0; j < 4; j++)
|
||||||
|
atomicAdd((&glbl[g_adr + j]), sum4[nt][0][j]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||||
|
asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
|
||||||
|
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||||
|
|
||||||
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
int nindx_ = (0 + (threadIdx.x / 16) * 4) + 0 * NTILE +
|
||||||
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||||
int adr_ = mindx + M * nindx_ / 4;
|
int adr_ = mindx + M * nindx_ / 4;
|
||||||
// Update the complete counter
|
|
||||||
my_cntr = atomicAdd(&cntr[adr_], 1);
|
my_cntr = atomicAdd(&cntr[adr_], 1);
|
||||||
float vals[N / NTILE / GrpsShrB][4] = {};
|
|
||||||
|
// make sure LDS is free for write out staging
|
||||||
|
if (DTRMNSTC) __syncthreads();
|
||||||
|
|
||||||
|
// Update the complete counter
|
||||||
|
flt4 vals[N / NTILE / GrpsShrB] = {};
|
||||||
// If we're the last k-shard, read back the value and convert...
|
// If we're the last k-shard, read back the value and convert...
|
||||||
if (my_cntr + 1 == k_rnd) {
|
if (my_cntr + 1 == k_rnd) {
|
||||||
|
cntr[adr_] = 0; // clear for next round
|
||||||
|
if constexpr (DTRMNSTC) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int ks = 0; ks < k_rnd; ks++) {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
int g_nindx =
|
||||||
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
|
__builtin_amdgcn_global_load_lds(
|
||||||
|
(float4*)(&glbl[g_adr + M * N * ks]),
|
||||||
|
&(((float4*)s)[(threadIdx.y * THRDS) + ks * THRDS * 4 +
|
||||||
|
nt * THRDS * 4 * k_rnd]),
|
||||||
|
16, 0, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
if (BIAS)
|
if (BIAS)
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
for (uint32_t j = 0; j < 4; j++) {
|
||||||
@@ -1565,12 +1599,29 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
asm volatile("s_waitcnt 0");
|
||||||
|
for (int ks = 0; ks < k_rnd; ks++) {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
float4 eval = ((float4*)s)[(threadIdx.x + threadIdx.y * THRDS) +
|
||||||
|
ks * THRDS * 4 + nt * THRDS * 4 * k_rnd];
|
||||||
|
vals[nt].f4 += eval;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
|
int g_nindx =
|
||||||
|
(nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
||||||
|
int g_adr = g_mindx * 4 + 0 + M * g_nindx * 4;
|
||||||
|
vals[nt].f4 = *(float4*)(&glbl[g_adr]);
|
||||||
|
*(float4*)(&glbl[g_adr]) = {}; // clear out for next round
|
||||||
|
}
|
||||||
|
if (BIAS)
|
||||||
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
for (uint32_t nt = 0; nt < N / NTILE / GrpsShrB; nt++) {
|
||||||
for (uint32_t j = 0; j < 4; j++) {
|
for (uint32_t j = 0; j < 4; j++) {
|
||||||
int g_nindx =
|
int nindx = (j + (threadIdx.x / 16) * 4) + nt * NTILE +
|
||||||
j + (nt * NTILE + (N / GrpsShrB) * (threadIdx.y % GrpsShrB)) / 4;
|
(N / GrpsShrB) * (threadIdx.y % GrpsShrB);
|
||||||
int g_adr = g_mindx + M * g_nindx * 4;
|
biases[nt][j] = BIAS[(mindx % Bx) + (nindx % By) * Bx];
|
||||||
vals[nt][j] = glbl[g_adr];
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__builtin_amdgcn_sched_barrier(0);
|
__builtin_amdgcn_sched_barrier(0);
|
||||||
@@ -1581,11 +1632,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
if (nindx < actlN) {
|
if (nindx < actlN) {
|
||||||
int adr = mindx + M * nindx;
|
int adr = mindx + M * nindx;
|
||||||
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
|
||||||
vals[nt][j] += __bfloat162float(biases[nt][j]);
|
vals[nt].s8[j] += __bfloat162float(biases[nt][j]);
|
||||||
C[adr] = __float2bfloat16(vals[nt][j]);
|
C[adr] = __float2bfloat16(vals[nt].s8[j]);
|
||||||
} else {
|
} else {
|
||||||
vals[nt][j] += __half2float(biases[nt][j]);
|
vals[nt].s8[j] += __half2float(biases[nt][j]);
|
||||||
C[adr] = __float2half(vals[nt][j]);
|
C[adr] = __float2half(vals[nt].s8[j]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1604,21 +1655,25 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
|
|||||||
}
|
}
|
||||||
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
|
||||||
int UNRL, int N, int GrpsShrB, int CHUNKK>
|
int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
|
||||||
__global__ void wvSplitKrc_(const int actlN, const int K, const int M,
|
__global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
|
||||||
const int Bx, const int By, const scalar_t* B,
|
const int M, const int Bx, const int By,
|
||||||
const scalar_t* __restrict__ A,
|
const scalar_t* B, const scalar_t* __restrict__ A,
|
||||||
const scalar_t* __restrict__ BIAS, float* glbl,
|
const scalar_t* __restrict__ BIAS, float* glbl,
|
||||||
// int* cntr,
|
int* cntr, scalar_t* C,
|
||||||
scalar_t* C, const int CuCount){UNREACHABLE_CODE}
|
const int CuCount){UNREACHABLE_CODE}
|
||||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||||
|
|
||||||
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||||
const std::optional<at::Tensor>& in_bias,
|
const std::optional<at::Tensor>& in_bias,
|
||||||
const int64_t CuCount) {
|
const int64_t CuCount) {
|
||||||
auto M_in = in_a.size(0);
|
int _DTRMNSTC = 1; // vllm::vllm_is_batch_invariant();
|
||||||
auto N_in = in_b.size(0);
|
|
||||||
auto K_in = in_a.size(1);
|
auto M_in = in_b.size(0);
|
||||||
|
auto N_in = in_a.size(0);
|
||||||
|
auto K_in = in_b.size(1);
|
||||||
|
auto Kap_in = in_a.stride(0);
|
||||||
|
|
||||||
auto Bx_in =
|
auto Bx_in =
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
(in_bias.has_value() && in_bias->numel() > 0)
|
||||||
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
|
||||||
@@ -1635,13 +1690,9 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
|
|
||||||
auto out_c = torch::empty(
|
auto out_c = torch::empty(
|
||||||
{N_in, M_in},
|
{N_in, M_in},
|
||||||
torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device()));
|
torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
|
||||||
|
|
||||||
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
|
auto N_p2 = 1U << (32 - __builtin_clz(N_in - 1));
|
||||||
auto axl_glbl = torch::empty(
|
|
||||||
{N_p2 + N_p2 / 4, M_in + M_in / 4},
|
|
||||||
torch::TensorOptions().dtype(torch::kFloat32).device(in_b.device()));
|
|
||||||
axl_glbl.zero_(); // disable for FAST_UNSAFE_RDC_INIT
|
|
||||||
|
|
||||||
dim3 grid(CuCount);
|
dim3 grid(CuCount);
|
||||||
|
|
||||||
@@ -1649,25 +1700,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
// const int max_lds_len = get_lds_size() / 2;
|
// const int max_lds_len = get_lds_size() / 2;
|
||||||
|
|
||||||
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
|
|
||||||
{ \
|
|
||||||
dim3 block(64, 4); \
|
|
||||||
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK> \
|
|
||||||
<<<grid, block, 0, stream>>>(N_in, K_in, M_in, Bx_in, By_in, af4, bf4, \
|
|
||||||
biasf4, glbl, c, CuCount); \
|
|
||||||
}
|
|
||||||
|
|
||||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitKrc", [&] {
|
|
||||||
using fptype = typename scalar<scalar_t>::type;
|
|
||||||
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
|
|
||||||
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
|
||||||
const fptype* biasf4 =
|
|
||||||
(in_bias.has_value() && in_bias->numel() > 0)
|
|
||||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
|
||||||
: nullptr;
|
|
||||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
|
||||||
auto glbl = axl_glbl.data_ptr<float>();
|
|
||||||
|
|
||||||
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
|
// With 64 Ms per CU (each of 4 SIMDs working on a 16x16 tile),
|
||||||
// and each working on a 512-shard of K, how many CUs would we need?
|
// and each working on a 512-shard of K, how many CUs would we need?
|
||||||
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
|
int rndup_cus = ((M_in + 64 - 1) / 64) * ((K_in + 512 - 1) / 512);
|
||||||
@@ -1680,24 +1712,58 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
|
|||||||
// Given the above, how many CUs would we need?
|
// Given the above, how many CUs would we need?
|
||||||
int CuNeeded = rndup_cus * GrpsShrB;
|
int CuNeeded = rndup_cus * GrpsShrB;
|
||||||
|
|
||||||
if (CuNeeded > CuCount) std::runtime_error("Invalid wvSplitKrc size");
|
if (CuNeeded > CuCount) throw std::runtime_error("Invalid wvSplitKrc size");
|
||||||
|
|
||||||
// Can we increase SplitK by shrinking the K-shared to 256?
|
// Can we increase SplitK by shrinking the K-shared to 256?
|
||||||
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
|
int chunkk = (CuNeeded * 2 <= CuCount) ? 2 : 1;
|
||||||
|
|
||||||
|
static torch::Tensor axl_glbl =
|
||||||
|
torch::zeros(
|
||||||
|
128 * 1024 * (_DTRMNSTC ? 12 : 1),
|
||||||
|
torch::TensorOptions().dtype(torch::kFloat32).device(in_a.device()))
|
||||||
|
.detach();
|
||||||
|
static torch::Tensor axl_cntr =
|
||||||
|
torch::zeros(
|
||||||
|
128 * 1024 * (_DTRMNSTC ? 12 : 1) / 4,
|
||||||
|
torch::TensorOptions().dtype(torch::kInt).device(in_a.device()))
|
||||||
|
.detach();
|
||||||
|
auto glbl = axl_glbl.data_ptr<float>();
|
||||||
|
auto cntr = axl_cntr.data_ptr<int>();
|
||||||
|
|
||||||
|
#define WVSPLITKrc(_N, _GrpsShrB, _CHUNKK) \
|
||||||
|
{ \
|
||||||
|
dim3 block(64, 4); \
|
||||||
|
if (_DTRMNSTC) \
|
||||||
|
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 1> \
|
||||||
|
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
|
||||||
|
af4, bf4, biasf4, glbl, cntr, c, \
|
||||||
|
CuCount); \
|
||||||
|
else \
|
||||||
|
wvSplitKrc_<fptype, 64, 16, 4, 8, 1, _N, _GrpsShrB, _CHUNKK, 0> \
|
||||||
|
<<<grid, block, 0, stream>>>(N_in, K_in, Kap_in, M_in, Bx_in, By_in, \
|
||||||
|
af4, bf4, biasf4, glbl, cntr, c, \
|
||||||
|
CuCount); \
|
||||||
|
}
|
||||||
|
|
||||||
|
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_a.scalar_type(), "wvSplitKrc", [&] {
|
||||||
|
using fptype = typename scalar<scalar_t>::type;
|
||||||
|
const fptype* af4 = reinterpret_cast<const fptype*>(in_a.data_ptr());
|
||||||
|
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
|
||||||
|
const fptype* biasf4 =
|
||||||
|
(in_bias.has_value() && in_bias->numel() > 0)
|
||||||
|
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||||
|
|
||||||
switch (N_p2) {
|
switch (N_p2) {
|
||||||
case 16:
|
case 16:
|
||||||
WVSPLITKrc(16, 1, 1) break;
|
WVSPLITKrc(16, 1, 1) break;
|
||||||
case 32:
|
case 32:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(32, 2, 2) else WVSPLITKrc(32, 2, 1) break;
|
||||||
WVSPLITKrc(32, 2, 2) else if (chunkk == 1) WVSPLITKrc(32, 2, 1) break;
|
|
||||||
case 64:
|
case 64:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(64, 4, 2) else WVSPLITKrc(64, 4, 1) break;
|
||||||
WVSPLITKrc(64, 4, 2) else if (chunkk == 1) WVSPLITKrc(64, 4, 1) break;
|
|
||||||
case 128:
|
case 128:
|
||||||
if (chunkk == 2)
|
if (chunkk == 2) WVSPLITKrc(128, 4, 2) else WVSPLITKrc(128, 4, 1) break;
|
||||||
WVSPLITKrc(128, 4, 2) else if (chunkk == 1)
|
|
||||||
WVSPLITKrc(128, 4, 1) break;
|
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"Unsupported N value: " + std::to_string(M_in) + "," +
|
"Unsupported N value: " + std::to_string(M_in) + "," +
|
||||||
|
|||||||
@@ -802,6 +802,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
|||||||
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
|
||||||
&indexer_k_quant_and_cache);
|
&indexer_k_quant_and_cache);
|
||||||
|
|
||||||
|
cache_ops.def(
|
||||||
|
"concat_mla_q(Tensor ql_nope, Tensor q_pe, Tensor! q_out) -> ()");
|
||||||
|
cache_ops.impl("concat_mla_q", torch::kCUDA, &concat_mla_q);
|
||||||
|
|
||||||
cache_ops.def(
|
cache_ops.def(
|
||||||
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
|
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
|
||||||
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
|
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
|
||||||
|
|||||||
@@ -18,14 +18,14 @@ th {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Dataset | Online | Offline | Data Path |
|
| Dataset | Online | Offline | Data Path |
|
||||||
|---------|--------|---------|-----------|
|
| ------- | ------ | ------- | --------- |
|
||||||
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
|
||||||
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
|
||||||
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
|
||||||
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
|
||||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||||
| Random | ✅ | ✅ | `synthetic` |
|
| Random | ✅ | ✅ | `synthetic` |
|
||||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
| RandomMultiModal (Image/Video) | ✅ | ✅ | `synthetic` |
|
||||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
||||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||||
@@ -545,6 +545,24 @@ vllm bench throughput \
|
|||||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Synthetic Random Multimodal (random-mm)
|
||||||
|
|
||||||
|
Generate synthetic multimodal inputs for offline throughput testing without external datasets.
|
||||||
|
Use `--backend vllm-chat` so that image tokens are counted correctly.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench throughput \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
### 🛠️ Structured Output Benchmark
|
### 🛠️ Structured Output Benchmark
|
||||||
@@ -846,8 +864,8 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
|
|||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
|
|
||||||
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
|
- For online benchmarks, use `--backend openai-chat` with endpoint `/v1/chat/completions`.
|
||||||
- Video sampling is not yet implemented.
|
- For offline benchmarks, use `--backend vllm-chat` (see [Offline Throughput Benchmark](#-offline-throughput-benchmark) for an example).
|
||||||
|
|
||||||
Start the server (example):
|
Start the server (example):
|
||||||
|
|
||||||
@@ -913,6 +931,74 @@ This should be seen as an edge case, and if this behavior can be avoided by sett
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
### 🔬 Multimodal Processor Benchmark
|
||||||
|
|
||||||
|
Benchmark per-stage latency of the multimodal (MM) input processor pipeline, including the encoder forward pass. This is useful for profiling preprocessing bottlenecks in vision-language models.
|
||||||
|
|
||||||
|
<details class="admonition abstract" markdown="1">
|
||||||
|
<summary>Show more</summary>
|
||||||
|
|
||||||
|
The benchmark measures the following stages for each request:
|
||||||
|
|
||||||
|
| Stage | Description |
|
||||||
|
| ----- | ----------- |
|
||||||
|
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
|
||||||
|
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
|
||||||
|
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
|
||||||
|
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
|
||||||
|
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
|
||||||
|
| `preprocessor_total_secs` | Total preprocessing time |
|
||||||
|
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
|
||||||
|
| `num_encoder_calls` | Number of encoder invocations per request |
|
||||||
|
|
||||||
|
The benchmark also reports end-to-end latency (TTFT + decode time) per
|
||||||
|
request. Use `--metric-percentiles` to select which percentiles to report
|
||||||
|
(default: p99) and `--output-json` to save results.
|
||||||
|
|
||||||
|
#### Basic Example with Synthetic Data (random-mm)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 50 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using a HuggingFace Dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 100
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Warmup, Custom Percentiles, and JSON Output
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 200 \
|
||||||
|
--num-warmups 5 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 1 \
|
||||||
|
--metric-percentiles 50,90,95,99 \
|
||||||
|
--output-json results.json
|
||||||
|
```
|
||||||
|
|
||||||
|
See [`vllm bench mm-processor`](../cli/bench/mm_processor.md) for the full argument reference.
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
### Embedding Benchmark
|
### Embedding Benchmark
|
||||||
|
|
||||||
Benchmark the performance of embedding requests in vLLM.
|
Benchmark the performance of embedding requests in vLLM.
|
||||||
|
|||||||
@@ -61,11 +61,11 @@ Here is an example using the script to compare result_a and result_b with max co
|
|||||||
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
|
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
|
||||||
|
|
||||||
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|
||||||
|----|------|-----|-----------|----------|----------|
|
| | -------------------- | --- | -------------------------------- | -------------------------------- | ---------- |
|
||||||
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
|
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
|
||||||
| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
|
| 1 | 16 | inf | 25.49 | 246.92 | 9.69 |
|
||||||
| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
|
| 2 | 24 | inf | 27.74 | 293.34 | 10.57 |
|
||||||
| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
|
| 3 | 32 | inf | 28.61 |306.69 | 10.72 |
|
||||||
|
|
||||||
***compare-json-results.py – Command-Line Parameters***
|
***compare-json-results.py – Command-Line Parameters***
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,51 @@
|
|||||||
# vllm bench mm-processor
|
# vllm bench mm-processor
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
`vllm bench mm-processor` profiles the multimodal input processor pipeline of
|
||||||
|
vision-language models. It measures per-stage latency from the HuggingFace
|
||||||
|
processor through to the encoder forward pass, helping you identify
|
||||||
|
preprocessing bottlenecks and understand how different image resolutions or
|
||||||
|
item counts affect end-to-end request time.
|
||||||
|
|
||||||
|
The benchmark supports two data sources: synthetic random multimodal inputs
|
||||||
|
(`random-mm`) and HuggingFace datasets (`hf`). Warmup requests are run before
|
||||||
|
measurement to ensure stable results.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm bench mm-processor \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--dataset-name random-mm \
|
||||||
|
--num-prompts 50 \
|
||||||
|
--random-input-len 300 \
|
||||||
|
--random-output-len 40 \
|
||||||
|
--random-mm-base-items-per-request 2 \
|
||||||
|
--random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
|
||||||
|
--random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Measured Stages
|
||||||
|
|
||||||
|
| Stage | Description |
|
||||||
|
| ----- | ----------- |
|
||||||
|
| `get_mm_hashes_secs` | Time spent hashing multimodal inputs |
|
||||||
|
| `get_cache_missing_items_secs` | Time spent looking up the processor cache |
|
||||||
|
| `apply_hf_processor_secs` | Time spent in the HuggingFace processor |
|
||||||
|
| `merge_mm_kwargs_secs` | Time spent merging multimodal kwargs |
|
||||||
|
| `apply_prompt_updates_secs` | Time spent updating prompt tokens |
|
||||||
|
| `preprocessor_total_secs` | Total preprocessing time |
|
||||||
|
| `encoder_forward_secs` | Time spent in the encoder model forward pass |
|
||||||
|
| `num_encoder_calls` | Number of encoder invocations per request |
|
||||||
|
|
||||||
|
The benchmark also reports end-to-end latency (TTFT + decode time) per
|
||||||
|
request. Use `--metric-percentiles` to select which percentiles to report
|
||||||
|
(default: p99) and `--output-json` to save results.
|
||||||
|
|
||||||
|
For more examples (HF datasets, warmup, JSON output), see
|
||||||
|
[Benchmarking CLI — Multimodal Processor Benchmark](../../benchmarking/cli.md#multimodal-processor-benchmark).
|
||||||
|
|
||||||
## JSON CLI Arguments
|
## JSON CLI Arguments
|
||||||
|
|
||||||
--8<-- "docs/cli/json_tip.inc.md"
|
--8<-- "docs/cli/json_tip.inc.md"
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
<!-- markdownlint-disable MD041 -->
|
||||||
When passing JSON CLI arguments, the following sets of arguments are equivalent:
|
When passing JSON CLI arguments, the following sets of arguments are equivalent:
|
||||||
|
|
||||||
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
|
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
|
||||||
|
|||||||
@@ -5,6 +5,17 @@ This guide covers optimization strategies and performance tuning for vLLM V1.
|
|||||||
!!! tip
|
!!! tip
|
||||||
Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
|
Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.
|
||||||
|
|
||||||
|
## Optimization Levels
|
||||||
|
|
||||||
|
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
|
||||||
|
|
||||||
|
- `-O0`: No optimizations. Fastest startup time, but lowest performance.
|
||||||
|
- `-O1`: Fast optimization. Simple compilation and fast fusions, and PIECEWISE cudagraphs.
|
||||||
|
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
|
||||||
|
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
|
||||||
|
|
||||||
|
For more information, see the [optimization level documentation](../design/optimization_levels.md).
|
||||||
|
|
||||||
## Preemption
|
## Preemption
|
||||||
|
|
||||||
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
Due to the autoregressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
|
||||||
@@ -282,7 +293,7 @@ llm = LLM(
|
|||||||
Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:
|
Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:
|
||||||
|
|
||||||
| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
|
| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
|
||||||
|-------------------|-------------|------------|------------|-------------|-------------|
|
| ----------------- | ----------- | ---------- | ---------- | ----------- | ----------- |
|
||||||
| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
|
| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
|
||||||
| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
|
| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
|
||||||
| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
|
| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
|
||||||
|
|||||||
@@ -94,7 +94,6 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
|||||||
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pre-commit run --hook-stage manual markdownlint
|
|
||||||
pre-commit run --hook-stage manual mypy-3.10
|
pre-commit run --hook-stage manual mypy-3.10
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -66,9 +66,9 @@ This complicates the process as we cannot use the out-of-the-box
|
|||||||
- Important indexes at the moment include:
|
- Important indexes at the moment include:
|
||||||
|
|
||||||
| Platform | `--extra-index-url` |
|
| Platform | `--extra-index-url` |
|
||||||
|----------|-----------------|
|
| -------- | ------------------- |
|
||||||
| CUDA 12.8| [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128)|
|
| CUDA 12.8 | [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128) |
|
||||||
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu)|
|
| CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu) |
|
||||||
| ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
|
| ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
|
||||||
| ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
|
| ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
|
||||||
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
|
| XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ stages will be removed.
|
|||||||
Assume a feature is deprecated in `v0.9.0`.
|
Assume a feature is deprecated in `v0.9.0`.
|
||||||
|
|
||||||
| Release | Status |
|
| Release | Status |
|
||||||
|---------------|-------------------------------------------------------------------------------------------------|
|
| ------------- | ----------------------------------------------------------------------------------------------- |
|
||||||
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
|
| `v0.9.0` | Feature is deprecated with clear removal version listed. |
|
||||||
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
|
| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. |
|
||||||
| `v0.11.0` | Feature is removed. |
|
| `v0.11.0` | Feature is removed. |
|
||||||
|
|||||||
@@ -5,8 +5,12 @@
|
|||||||
|
|
||||||
## Profile with PyTorch Profiler
|
## Profile with PyTorch Profiler
|
||||||
|
|
||||||
We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config`
|
We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.
|
||||||
when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
|
||||||
|
!!! note
|
||||||
|
The `--profiler-config` flag is available in vLLM v0.13.0 and later. If you are using an earlier version, please upgrade to use this feature.
|
||||||
|
|
||||||
|
To use the `torch.profiler` module, set the `profiler` entry to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config:
|
||||||
|
|
||||||
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default
|
||||||
- `torch_profiler_with_memory` to record memory, off by default
|
- `torch_profiler_with_memory` to record memory, off by default
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ chart **including persistent volumes** and deletes the release.
|
|||||||
The following table describes configurable parameters of the chart in `values.yaml`:
|
The following table describes configurable parameters of the chart in `values.yaml`:
|
||||||
|
|
||||||
| Key | Type | Default | Description |
|
| Key | Type | Default | Description |
|
||||||
|-----|------|---------|-------------|
|
| --- | ---- | ------- | ----------- |
|
||||||
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
||||||
| autoscaling.enabled | bool | false | Enable autoscaling |
|
| autoscaling.enabled | bool | false | Enable autoscaling |
|
||||||
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
||||||
|
|||||||
87
docs/deployment/frameworks/runpod.md
Normal file
87
docs/deployment/frameworks/runpod.md
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# RunPod
|
||||||
|
|
||||||
|
vLLM can be deployed on [RunPod](https://www.runpod.io/), a cloud GPU platform that provides on-demand and serverless GPU instances for AI inference workloads.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- A RunPod account with GPU pod access
|
||||||
|
- A GPU pod running a CUDA-compatible template (e.g., `runpod/pytorch`)
|
||||||
|
|
||||||
|
## Starting the Server
|
||||||
|
|
||||||
|
SSH into your RunPod pod and launch the vLLM OpenAI-compatible server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m vllm.entrypoints.openai.api_server \
|
||||||
|
--model <model-name> \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
|
||||||
|
Use `--host 0.0.0.0` to bind to all interfaces so the server is reachable from outside the container.
|
||||||
|
|
||||||
|
## Exposing Port 8000
|
||||||
|
|
||||||
|
RunPod exposes HTTP services through its proxy. To make port 8000 accessible:
|
||||||
|
|
||||||
|
1. In the RunPod dashboard, navigate to your pod settings.
|
||||||
|
2. Add `8000` to the list of exposed HTTP ports.
|
||||||
|
3. After the pod restarts, RunPod provides a public URL in the format:
|
||||||
|
|
||||||
|
```text
|
||||||
|
https://<pod-id>-8000.proxy.runpod.net
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting 502 Bad Gateway
|
||||||
|
|
||||||
|
A `502 Bad Gateway` error from the RunPod proxy typically means the server is not yet listening. Common causes:
|
||||||
|
|
||||||
|
- **Model still loading** — Large models take time to download and load into GPU memory. Check the pod logs for progress.
|
||||||
|
- **Wrong host binding** — Ensure you passed `--host 0.0.0.0`. Binding to `127.0.0.1` (the default) makes the server unreachable from the proxy.
|
||||||
|
- **Port mismatch** — Verify the `--port` value matches the port exposed in the RunPod dashboard.
|
||||||
|
- **Out of GPU memory** — The model may be too large for the allocated GPU. Check logs for CUDA OOM errors and consider using a larger instance or adding `--tensor-parallel-size` for multi-GPU pods.
|
||||||
|
|
||||||
|
## Verifying the Deployment
|
||||||
|
|
||||||
|
Once the server is running, test it with a curl request:
|
||||||
|
|
||||||
|
!!! console "Command"
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://<pod-id>-8000.proxy.runpod.net/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "<model-name>",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "Hello, how are you?"}
|
||||||
|
],
|
||||||
|
"max_tokens": 50
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
!!! console "Response"
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "chat-abc123",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "I'm doing well, thank you for asking! How can I help you today?"
|
||||||
|
},
|
||||||
|
"index": 0,
|
||||||
|
"finish_reason": "stop"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also check the server health endpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://<pod-id>-8000.proxy.runpod.net/health
|
||||||
|
```
|
||||||
5
docs/deployment/integrations/aibrix.md
Normal file
5
docs/deployment/integrations/aibrix.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# AIBrix
|
||||||
|
|
||||||
|
[AIBrix](https://github.com/vllm-project/aibrix) is a cloud-native control plane that integrates with vLLM to simplify Kubernetes deployment, scaling, routing, and LoRA adapter management for large language model inference.
|
||||||
|
|
||||||
|
For installation and usage instructions, please refer to the [AIBrix documentation](https://aibrix.readthedocs.io/).
|
||||||
7
docs/deployment/integrations/dynamo.md
Normal file
7
docs/deployment/integrations/dynamo.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# NVIDIA Dynamo
|
||||||
|
|
||||||
|
[NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) is an open-source framework for distributed LLM inference that can run vLLM on Kubernetes with flexible serving architectures (e.g. aggregated/disaggregated, optional router/planner).
|
||||||
|
|
||||||
|
For Kubernetes deployment instructions and examples (including vLLM), see the [Deploying Dynamo on Kubernetes](https://github.com/ai-dynamo/dynamo/blob/main/docs/kubernetes/README.md) guide.
|
||||||
|
|
||||||
|
Background reading: InfoQ news coverage — [NVIDIA Dynamo simplifies Kubernetes deployment for LLM inference](https://www.infoq.com/news/2025/12/nvidia-dynamo-kubernetes/).
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
Please see the Installation Guides for environment specific instructions:
|
Please see the Installation Guides for environment specific instructions:
|
||||||
|
|
||||||
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
|
- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/)
|
||||||
|
- [AKS](https://www.kubeai.org/installation/aks/)
|
||||||
- [EKS](https://www.kubeai.org/installation/eks/)
|
- [EKS](https://www.kubeai.org/installation/eks/)
|
||||||
- [GKE](https://www.kubeai.org/installation/gke/)
|
- [GKE](https://www.kubeai.org/installation/gke/)
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ A Ray cluster can be declared in YAML, and the operator then handles pod schedul
|
|||||||
## Why KubeRay instead of manual scripts?
|
## Why KubeRay instead of manual scripts?
|
||||||
|
|
||||||
| Feature | Manual scripts | KubeRay |
|
| Feature | Manual scripts | KubeRay |
|
||||||
|---------|-----------------------------------------------------------|---------|
|
| ------- | --------------------------------------------------------- | ------- |
|
||||||
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
|
| Cluster bootstrap | Manually SSH into every node and run a script | One command to create or update the whole cluster: `kubectl apply -f cluster.yaml` |
|
||||||
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
|
| Autoscaling | Manual | Automatically patches CRDs for adjusting cluster size |
|
||||||
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
|
| Upgrades | Tear down & re-create manually | Blue/green deployment updates supported |
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
|
|||||||
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||||
|
|
||||||
- [Helm](frameworks/helm.md)
|
- [Helm](frameworks/helm.md)
|
||||||
|
- [NVIDIA Dynamo](integrations/dynamo.md)
|
||||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||||
- [llm-d](integrations/llm-d.md)
|
- [llm-d](integrations/llm-d.md)
|
||||||
- [KAITO](integrations/kaito.md)
|
- [KAITO](integrations/kaito.md)
|
||||||
@@ -20,7 +21,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
|||||||
- [kubernetes-sigs/lws](frameworks/lws.md)
|
- [kubernetes-sigs/lws](frameworks/lws.md)
|
||||||
- [meta-llama/llama-stack](integrations/llamastack.md)
|
- [meta-llama/llama-stack](integrations/llamastack.md)
|
||||||
- [substratusai/kubeai](integrations/kubeai.md)
|
- [substratusai/kubeai](integrations/kubeai.md)
|
||||||
- [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
|
- [vllm-project/AIBrix](integrations/aibrix.md)
|
||||||
- [vllm-project/production-stack](integrations/production-stack.md)
|
- [vllm-project/production-stack](integrations/production-stack.md)
|
||||||
|
|
||||||
## Deployment with CPUs
|
## Deployment with CPUs
|
||||||
|
|||||||
@@ -119,10 +119,10 @@ The code can be found in [vllm/v1/engine/coordinator.py](../../vllm/v1/engine/co
|
|||||||
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
|
For a deployment with `N` GPUs, `TP` tensor parallel size, `DP` data parallel size, and `A` API server count:
|
||||||
|
|
||||||
| Process Type | Count | Notes |
|
| Process Type | Count | Notes |
|
||||||
|---|---|---|
|
| - | - | - |
|
||||||
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
|
| API Server | `A` (default `DP`) | Handles HTTP requests and input processing |
|
||||||
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
|
| Engine Core | `DP` (default 1) | Scheduler and KV cache management |
|
||||||
| GPU Worker | `N` (= `DP x TP`) | One per GPU, executes model forward passes |
|
| GPU Worker | `N` (= `DP x PP x TP`) | One per GPU, executes model forward passes |
|
||||||
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
|
| DP Coordinator | 1 if `DP > 1`, else 0 | Load balancing across DP ranks |
|
||||||
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
|
| **Total** | **`A + DP + N` (+ 1 if DP > 1)** | |
|
||||||
|
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Blackwell (SM 10.x):**
|
**Blackwell (SM 10.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASHINFER` |
|
| 1 | `FLASHINFER` |
|
||||||
| 2 | `FLASH_ATTN` |
|
| 2 | `FLASH_ATTN` |
|
||||||
| 3 | `TRITON_ATTN` |
|
| 3 | `TRITON_ATTN` |
|
||||||
@@ -110,7 +110,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Ampere/Hopper (SM 8.x-9.x):**
|
**Ampere/Hopper (SM 8.x-9.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASH_ATTN` |
|
| 1 | `FLASH_ATTN` |
|
||||||
| 2 | `FLASHINFER` |
|
| 2 | `FLASHINFER` |
|
||||||
| 3 | `TRITON_ATTN` |
|
| 3 | `TRITON_ATTN` |
|
||||||
@@ -121,7 +121,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Blackwell (SM 10.x):**
|
**Blackwell (SM 10.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASHINFER_MLA` |
|
| 1 | `FLASHINFER_MLA` |
|
||||||
| 2 | `CUTLASS_MLA` |
|
| 2 | `CUTLASS_MLA` |
|
||||||
| 3 | `FLASH_ATTN_MLA` |
|
| 3 | `FLASH_ATTN_MLA` |
|
||||||
@@ -133,7 +133,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
**Ampere/Hopper (SM 8.x-9.x):**
|
**Ampere/Hopper (SM 8.x-9.x):**
|
||||||
|
|
||||||
| Priority | Backend |
|
| Priority | Backend |
|
||||||
|----------|---------|
|
| -------- | ------- |
|
||||||
| 1 | `FLASH_ATTN_MLA` |
|
| 1 | `FLASH_ATTN_MLA` |
|
||||||
| 2 | `FLASHMLA` |
|
| 2 | `FLASHMLA` |
|
||||||
| 3 | `FLASHINFER_MLA` |
|
| 3 | `FLASHINFER_MLA` |
|
||||||
@@ -145,7 +145,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
## Legend
|
## Legend
|
||||||
|
|
||||||
| Column | Description |
|
| Column | Description |
|
||||||
|--------|-------------|
|
| ------ | ----------- |
|
||||||
| **Dtypes** | Supported model data types (fp16, bf16, fp32) |
|
| **Dtypes** | Supported model data types (fp16, bf16, fp32) |
|
||||||
| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) |
|
| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) |
|
||||||
| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
|
| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) |
|
||||||
@@ -162,7 +162,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
## Standard Attention (MHA, MQA, GQA) Backends
|
## Standard Attention (MHA, MQA, GQA) Backends
|
||||||
|
|
||||||
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
|
| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. |
|
||||||
|---------|---------|--------|-----------|-------------|------------|------|-----------|-----|-----------------|--------------|
|
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
|
||||||
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
||||||
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
||||||
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
|
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
|
||||||
@@ -171,9 +171,9 @@ Priority is **1 = highest** (tried first).
|
|||||||
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
||||||
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
|
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any |
|
||||||
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
|
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any |
|
||||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_FA` | | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder, Enc-Dec | N/A |
|
||||||
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | All | N/A |
|
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ✅ | ❌ | All | N/A |
|
||||||
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 544 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ✅ | ✅ | ❌ | All | N/A |
|
||||||
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
|
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any |
|
||||||
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
|
| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any |
|
||||||
|
|
||||||
@@ -191,7 +191,7 @@ The prefill backend is selected at runtime based on hardware and
|
|||||||
configuration.
|
configuration.
|
||||||
|
|
||||||
| Backend | Description | Compute Cap. | Enable | Disable | Notes |
|
| Backend | Description | Compute Cap. | Enable | Disable | Notes |
|
||||||
|---------|-------------|--------------|--------|---------|-------|
|
| ------- | ----------- | ------------ | ------ | ------- | ----- |
|
||||||
| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
|
| TRT-LLM Ragged‡ | TensorRT-LLM ragged attention | 10.x | Default on SM100 | `-ac.use_trtllm_ragged_deepseek_prefill=0` | DeepSeek R1 dims only |
|
||||||
| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
|
| FlashInfer | FlashInfer CUTLASS backend | 10.x | `-ac.disable_flashinfer_prefill=0` | `-ac.disable_flashinfer_prefill=1` | DeepSeek R1 dims only |
|
||||||
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
|
| cuDNN | cuDNN-based attention | 10.x | `-ac.use_cudnn_prefill=1` | `-ac.use_cudnn_prefill=0` | |
|
||||||
@@ -203,14 +203,14 @@ configuration.
|
|||||||
### Decode Backends
|
### Decode Backends
|
||||||
|
|
||||||
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
|
| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. |
|
||||||
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------|
|
| ------- | ------ | --------- | ----------- | ---------- | ---- | ------ | --------- | --- | --------------- | ------------ |
|
||||||
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
|
| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x |
|
||||||
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
|
| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x |
|
||||||
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
|
| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x |
|
||||||
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
|
| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x |
|
||||||
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
|
| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x |
|
||||||
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
|
| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x |
|
||||||
| `ROCM_AITER_MLA` | fp16, bf16 | `auto` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto` | Any | 576 | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 1 | Any | ❌ | ✅ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A |
|
||||||
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | Any | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
|
| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any |
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal
|
|||||||
|
|
||||||
### `CudagraphDispatcher`
|
### `CudagraphDispatcher`
|
||||||
|
|
||||||
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
|
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
|
||||||
|
|
||||||
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
|
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
|
||||||
|
|
||||||
@@ -174,18 +174,18 @@ Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that
|
|||||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||||
|
|
||||||
| Attention Backend | cudagraph_support | Comments |
|
| Attention Backend | cudagraph_support | Comments |
|
||||||
|:---|:---|:---|
|
| :---------------- | :---------------- | :------- |
|
||||||
| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS` but workaround to fallback to `FULL_AND_PIECEWISE` for performance reason |
|
| FlashAttention v2 | `UNIFORM_BATCH` | Actually `ALWAYS` but workaround to fallback to `FULL_AND_PIECEWISE` for performance reason |
|
||||||
| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
|
| FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
|
||||||
| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
|
| Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
|
||||||
| AITER FlashAttention | `UNIFORM_BATCH`| |
|
| AITER FlashAttention | `UNIFORM_BATCH` | |
|
||||||
| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
|
| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
|
||||||
| FlashMLA | `UNIFORM_BATCH` | |
|
| FlashMLA | `UNIFORM_BATCH` | |
|
||||||
| FlashInferMLA | `UNIFORM_BATCH` | |
|
| FlashInferMLA | `UNIFORM_BATCH` | |
|
||||||
| FlashInferMLASparse | `UNIFORM_BATCH` | |
|
| FlashInferMLASparse | `UNIFORM_BATCH` | |
|
||||||
| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
| Mamba attention| `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
| Mamba attention | `UNIFORM_SINGLE_TOKEN_DECODE` | |
|
||||||
|
|
||||||
Unlisted backends are all declared as `NEVER`.
|
Unlisted backends are all declared as `NEVER`.
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ TL;DR:
|
|||||||
- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
|
- The vLLM-torch.compile integration is multiple pieces. vLLM exposes flags to turn off each piece:
|
||||||
|
|
||||||
| Online Flag | Offline Flag | Result |
|
| Online Flag | Offline Flag | Result |
|
||||||
|----------|----------|-------------|
|
| ----------- | ------------ | ------ |
|
||||||
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
|
||||||
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
|
| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
|
||||||
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
|
| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe
|
|||||||
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
||||||
|
|
||||||
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
||||||
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
||||||
|
|
||||||
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
|
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
|
||||||
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
|
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
|
||||||
|
|||||||
339
docs/design/fusions.md
Normal file
339
docs/design/fusions.md
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
# Fusion torch.compile passes
|
||||||
|
|
||||||
|
vLLM applies a set of kernel/operator fusions at compile time (via custom [`torch.compile`](torch_compile.md) Inductor passes)
|
||||||
|
to separate optimizations from model definitions and avoid breaking layer abstractions in model code.
|
||||||
|
These fusions are controlled by fields in [`PassConfig`][vllm.config.compilation.PassConfig] and are automatically enabled
|
||||||
|
at appropriate [optimization levels](optimization_levels.md).
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
The table below maps each fusion to its controlling flag/config knob, the
|
||||||
|
operations it fuses, what level enables it by default, and an indicative speedup.
|
||||||
|
The Fullgraph column indicates whether the fusion requires the entire model graph to be
|
||||||
|
visible (either via Inductor partition or `splitting_ops=[]`),
|
||||||
|
and the last column indicates whether the fusion activates for all `num_tokens`
|
||||||
|
or just on the low or high end.
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Speedup depends heavily on the exact model, batch size, and hardware.
|
||||||
|
If tuning performance by hand, always benchmark your exact use-case with and without the fusion to verify the impact.
|
||||||
|
|
||||||
|
| Fusion | `PassConfig` flag | Fused operations | Default at | E2E Speedup | Fullgraph | `num_tokens` |
|
||||||
|
| ------------------------------------------------------------------------------ | ---------------------------- | ---------------------------------------------- | ------------------------------ | ------------------ | --------- | ------------ |
|
||||||
|
| [AllReduce + RMSNorm](#allreduce--rmsnorm-fuse_allreduce_rms) | `fuse_allreduce_rms` | All-reduce → RMSNorm (+residual_add) (→ quant) | O2 (Hopper/Blackwell + TP > 1) | 5-20% | No | Low |
|
||||||
|
| [Attention + Quant](#attention--quantization-fuse_attn_quant) | `fuse_attn_quant` | Attention output → FP8/NVFP4 quant | Off by default | 3-7% | Yes | Always |
|
||||||
|
| [RoPE + KV-Cache Update](#rope--kv-cache-update-fuse_rope_kvcache) | `fuse_rope_kvcache` | Rotary embedding → KV cache write | O1 (ROCm/AITER only) | TBD | No | Low |
|
||||||
|
| [QK Norm + RoPE](#qk-norm--rope-enable_qk_norm_rope_fusion) | `enable_qk_norm_rope_fusion` | Q/K RMSNorm → rotary embedding | Off by default | 2-3% | No | Low |
|
||||||
|
| [Sequence Parallelism](#sequence-parallelism-enable_sp) | `enable_sp` | AllReduce → ReduceScatter + AllGather | Off by default | Prereq for AsyncTP | Yes | High |
|
||||||
|
| [AsyncTP GEMM + collective](#asynctp-gemm--collective-overlap-fuse_gemm_comms) | `fuse_gemm_comms` | GEMM → reduce-scatter / all-gather → GEMM | Off by default | 7-10% | Yes | High |
|
||||||
|
| [RMSNorm + Quant](#rmsnorm--quantization-fuse_norm_quant) | `fuse_norm_quant` | RMSNorm (+residual add) → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
|
||||||
|
| [SiLU+Mul + Quant](#silumul--quantization-fuse_act_quant) | `fuse_act_quant` | SiLU+Mul activation → FP8/FP4 quant | O1 (conditional) | 1-4% | No | Always |
|
||||||
|
| [RMSNorm + Padding](#rmsnorm--padding-fuse_act_padding) | `fuse_act_padding` | Residual add + RMSNorm → padding | O1 (ROCm/AITER only) | TBD | No | Always |
|
||||||
|
|
||||||
|
## Support Matrix
|
||||||
|
|
||||||
|
The table below lists the quantization schemes supported by each fusion on each platform.
|
||||||
|
**—** means the fusion is not available on that platform. The latest and in-progress work is available in the tracking issue:
|
||||||
|
[#36066](https://github.com/vllm-project/vllm/issues/36066)
|
||||||
|
|
||||||
|
| Fusion | SM100 (Blackwell) | SM90 (Hopper) | SM89 (Ada) | SM80 (Ampere) | ROCm |
|
||||||
|
| ---------------------------- | ---------------------------------------- | ---------------------------------------- | ---------------------------------------- | ------------- | ---------------------------------------- |
|
||||||
|
| `fuse_allreduce_rms` | FP16/BF16, FP8 static, NVFP4 | FP16/BF16, FP8 static | — | — | — |
|
||||||
|
| `fuse_attn_quant`\* | FP8 static\*, NVFP4\* | FP8 static\* | FP8 static\* | — | FP8 static\* |
|
||||||
|
| `fuse_rope_kvcache` | — | — | — | — | FP16/BF16 |
|
||||||
|
| `enable_qk_norm_rope_fusion` | FP16/BF16 | FP16/BF16 | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `enable_sp` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `fuse_gemm_comms` | FP16/BF16, FP8 static† | FP16/BF16, FP8 static | FP16/BF16† | FP16/BF16† | — |
|
||||||
|
| `fuse_norm_quant` | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | FP8 static, FP8 per-token, FP8 per-group | — | FP8 static, FP8 per-token, FP8 per-group |
|
||||||
|
| `fuse_act_quant` | FP8 static, NVFP4 | FP8 static | FP8 static | — | FP8 per-group |
|
||||||
|
| `fuse_act_padding` | — | — | — | — | FP16/BF16 |
|
||||||
|
|
||||||
|
\* `fuse_attn_quant` support depends on the attention backend in use; not all backends support
|
||||||
|
fused quantization output. See the [`fuse_attn_quant` section](#attention--quantization-fuse_attn_quant)
|
||||||
|
for per-backend details.
|
||||||
|
|
||||||
|
† `enable_sp` and `fuse_gemm_comms` are only autoconfigured for SM90 today;
|
||||||
|
support on other architectures requires setting `PassConfig.sp_min_token_num` explicitly.
|
||||||
|
SM100 support also requires setting `VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel`.
|
||||||
|
|
||||||
|
## Enabling / Disabling Fusions
|
||||||
|
|
||||||
|
Fusions are exposed through `PassConfig`, which is nested inside `CompilationConfig`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM
|
||||||
|
from vllm.config import CompilationConfig, PassConfig
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="...",
|
||||||
|
optimization_level=2, # Default optimization level
|
||||||
|
compilation_config=CompilationConfig(
|
||||||
|
pass_config=PassConfig(
|
||||||
|
fuse_norm_quant=True,
|
||||||
|
fuse_act_quant=True,
|
||||||
|
fuse_allreduce_rms=False, # disable a specific fusion
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Fusions can also be enabled using command-line flags with any `vllm ...` command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enable O2 defaults, but turn off allreduce fusion
|
||||||
|
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
|
||||||
|
|
||||||
|
# The above is equivalent to the more verbose:
|
||||||
|
vllm serve meta-llama/Llama-3.1-8B-Instruct -O2 --compilation-config '{"pass_config": {"fuse_allreduce_rms": false}}'
|
||||||
|
|
||||||
|
# Same syntax in other commands, e.g. vllm bench:
|
||||||
|
vllm bench latency --model=meta-llama/Llama-3.1-8B-Instruct -O2 -cc.pass_config.fuse_allreduce_rms=False
|
||||||
|
```
|
||||||
|
|
||||||
|
Fields set explicitly by the user always take precedence over optimization-level defaults.
|
||||||
|
|
||||||
|
## Fusion Details
|
||||||
|
|
||||||
|
### AllReduce + RMSNorm (`fuse_allreduce_rms`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
TP+DP and TP+PP combinations are currently broken
|
||||||
|
([#34458](https://github.com/vllm-project/vllm/issues/34458) and
|
||||||
|
[#35426](https://github.com/vllm-project/vllm/issues/35426)).
|
||||||
|
Only supported on NVIDIA Hopper (SM90) and Blackwell (SM100) with FlashInfer installed.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the tensor-parallel all-reduce collective with the subsequent residual add,
|
||||||
|
RMSNorm, and optionally a quantization step into a single FlashInfer / TRT-LLM communication kernel.
|
||||||
|
This fusion is only profitable for small `num_tokens`,
|
||||||
|
so the fusion is only performed in the lower compiled range.
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- `AllReduce → RMSNorm(+residual_add)`: CUDA sm90+ with FlashInfer
|
||||||
|
- `AllReduce → RMSNorm(+residual_add) → FP8 static quant`: CUDA sm90+ with FlashInfer
|
||||||
|
- `AllReduce → RMSNorm(+residual_add) → NVFP4 dynamic quant`: CUDA sm100+ with FlashInfer
|
||||||
|
|
||||||
|
The maximum tensor size below which the fused kernel is used is hardware-dependent (64 MB for TP=2
|
||||||
|
on SM90/SM100) and configurable via `PassConfig.fi_allreduce_fusion_max_size_mb`.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/allreduce_rms_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/allreduce_rms_fusion.py)
|
||||||
|
- FlashInfer all-reduce: [`vllm/distributed/device_communicators/flashinfer_all_reduce.py`](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/flashinfer_all_reduce.py)
|
||||||
|
- Benchmark: [`benchmarks/kernels/benchmark_fused_collective.py`](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_fused_collective.py)
|
||||||
|
|
||||||
|
### Attention + Quantization (`fuse_attn_quant`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
`fuse_attn_quant` is currently not enabled at any optimization level by default and must be set
|
||||||
|
explicitly. It requires the full model graph to be visible (Inductor partition or `splitting_ops=[]`).
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the attention output quantization directly after the attention computation,
|
||||||
|
eliminating a full-precision memory round-trip of the attention output. Patterns covered:
|
||||||
|
|
||||||
|
`Attention → FP8 static quant`:
|
||||||
|
|
||||||
|
- `TRITON_ATTN`: CUDA, ROCm
|
||||||
|
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
|
||||||
|
- `ROCM_ATTN`: ROCm
|
||||||
|
- `ROCM_AITER_UNIFIED_ATTN`: ROCm with AITER
|
||||||
|
|
||||||
|
`Attention → NVFP4 dynamic quant`:
|
||||||
|
|
||||||
|
- `FLASHINFER`: CUDA sm100+ with FlashInfer installed
|
||||||
|
|
||||||
|
Other attention backends do not support fused output quantization yet.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/attn_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/attn_quant_fusion.py)
|
||||||
|
- Attention backends: [`vllm/v1/attention/backends/`](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/)
|
||||||
|
|
||||||
|
### RoPE + KV-Cache Update (`fuse_rope_kvcache`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
ROCm/AITER-only. Not available on NVIDIA CUDA or CPU. The fusion is only enabled for
|
||||||
|
`num_tokens ≤ 256` by default due to AITER fused kernel performance issues.
|
||||||
|
This threshold is configurable via `PassConfig.rope_kvcache_fusion_max_token_num`.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the rotary positional embedding kernel with the KV-cache scatter/write into
|
||||||
|
a single kernel, avoiding separate reads and writes of the key and value tensors.
|
||||||
|
|
||||||
|
Requires: AMD ROCm with AITER enabled, the `rotary_embedding` custom op active (automatic),
|
||||||
|
and the `kv_cache` update op visible in the graph: either by using Inductor graph partition
|
||||||
|
or by removing it from `splitting_ops`.
|
||||||
|
If these conditions are set, the fusion is enabled automatically for optimization level O1 and above.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rope_kvcache_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rope_kvcache_fusion.py)
|
||||||
|
|
||||||
|
### Sequence Parallelism (`enable_sp`)
|
||||||
|
|
||||||
|
**What it fuses.** Replaces all-reduce collectives with reduce-scatter + local RMSNorm + all-gather,
|
||||||
|
splitting the sequence dimension across TP ranks. This restructures the graph so the subsequent AsyncTP
|
||||||
|
pass can fuse the reduce-scatter / all-gather with the surrounding GEMMs.
|
||||||
|
|
||||||
|
Sequence Parallelism itself does not directly improve performance; it is a prerequisite for the
|
||||||
|
AsyncTP pass (`fuse_gemm_comms`). SP is only applied above a minimum token threshold that is
|
||||||
|
autoconfigured based on device capability and model `hidden_size`. Currently only active on
|
||||||
|
H100/SM90 for models with `hidden_size >= 8192`. The threshold is configurable via
|
||||||
|
`PassConfig.sp_min_token_num`.
|
||||||
|
|
||||||
|
The general transformation:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Input → AllReduce → RMSNorm → Output
|
||||||
|
becomes:
|
||||||
|
Input → ReduceScatter → local RMSNorm → AllGather → Output
|
||||||
|
```
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- First block: `AllReduce → RMSNorm` → `ReduceScatter → RMSNorm → AllGather`
|
||||||
|
- Middle blocks: `AllReduce → fused_add_RMSNorm` → `ReduceScatter → fused_add_RMSNorm → AllGather`
|
||||||
|
- Both with optional `→ FP8 static quant` suffix
|
||||||
|
|
||||||
|
Requires: `use_inductor_graph_partition=True` **or** piecewise compilation with static sizes
|
||||||
|
divisible by `tensor_parallel_size`.
|
||||||
|
|
||||||
|
Supported hardware: Only tested on NVIDIA CUDA, possibly works on ROCm. FP8 all-gather requires sm90+.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
|
||||||
|
|
||||||
|
### AsyncTP GEMM + Collective Overlap (`fuse_gemm_comms`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Requires `enable_sp=True` (enabled automatically). This pass is a no-op if Sequence Parallelism has not been applied.
|
||||||
|
|
||||||
|
**What it fuses.** After Sequence Parallelism transforms the graph, fuses GEMM kernels with the
|
||||||
|
surrounding reduce-scatter (output projection) and all-gather (input projection) using
|
||||||
|
`torch.ops.symm_mem` symmetric-memory primitives, overlapping communication and computation.
|
||||||
|
This overlap is only profitable for large `num_tokens`, so the fusion (and preceding SP)
|
||||||
|
is only performed in the higher compiled range above `PassConfig.sp_min_token_num`.
|
||||||
|
|
||||||
|
Patterns covered:
|
||||||
|
|
||||||
|
- `GEMM → reduce-scatter` → `fused_matmul_reduce_scatter`
|
||||||
|
- `all-gather → GEMM` → `all_gather_matmul`
|
||||||
|
- FP8 scaled variants of both patterns
|
||||||
|
|
||||||
|
Supported hardware: NVIDIA CUDA with symmetric-memory (`torch.distributed._symmetric_memory`) support.
|
||||||
|
|
||||||
|
On B200, pattern-matching fp8 FlashInfer scaled MM is not supported, so it must be disabled
|
||||||
|
([#27893](https://github.com/vllm-project/vllm/issues/27893))
|
||||||
|
|
||||||
|
```shell
|
||||||
|
VLLM_DISABLED_KERNELS=FlashInferFP8ScaledMMLinearKernel ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/collective_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/collective_fusion.py)
|
||||||
|
- Sequence parallelism pass: [`vllm/compilation/passes/fusion/sequence_parallelism.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/sequence_parallelism.py)
|
||||||
|
|
||||||
|
### QK Norm + RoPE (`enable_qk_norm_rope_fusion`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
Only applicable to models that apply per-head RMSNorm to Q and K before rotary positional
|
||||||
|
embedding (e.g. Qwen). Not enabled by default at any optimization level due to perf issues on H100:
|
||||||
|
[#34391](https://github.com/vllm-project/vllm/issues/34391)
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the sequence: split QKV → reshape → Q/K RMSNorm → reshape → rotary
|
||||||
|
embedding into a single `fused_qk_norm_rope` CUDA kernel.
|
||||||
|
|
||||||
|
```text
|
||||||
|
# Unfused:
|
||||||
|
q, k, v = split(qkv)
|
||||||
|
q_norm = rms_norm(q.view(heads))
|
||||||
|
k_norm = rms_norm(k.view(kv_heads))
|
||||||
|
q_rope, k_rope = rotary_embedding(q_norm, k_norm, ...)
|
||||||
|
|
||||||
|
# Fused:
|
||||||
|
fused_qk_norm_rope(qkv, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
Supported hardware: CUDA (sm80+) only, tested only on sm90 and sm100.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/qk_norm_rope_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/qk_norm_rope_fusion.py)
|
||||||
|
- CUDA kernel: [`csrc/ops.h`](https://github.com/vllm-project/vllm/blob/main/csrc/ops.h) (`fused_qk_norm_rope`)
|
||||||
|
|
||||||
|
### RMSNorm + Quantization (`fuse_norm_quant`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
On NVIDIA, Inductor actually generates a faster fused kernel than our custom CUDA kernel.
|
||||||
|
Hence, this fusion is only enabled when either `rms_norm` or `quant_fp8` is using a custom kernel.
|
||||||
|
|
||||||
|
**What it fuses.** Combines the custom `rms_norm` / `fused_add_rms_norm`
|
||||||
|
operations with subsequent quantization into a single fused kernel,
|
||||||
|
eliminating an intermediate read/write of the full-precision activation tensor.
|
||||||
|
Two variants are fused:
|
||||||
|
|
||||||
|
- *Plain RMSNorm + quant*: `rms_norm(x) → quant_fp8(y)`
|
||||||
|
- *Fused-add RMSNorm + quant*: `fused_add_rms_norm(x, residual) → quant_fp8(y)` — also updates the residual in-place.
|
||||||
|
|
||||||
|
Note that AITER fusions are currently in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
|
||||||
|
|
||||||
|
Supported quantization scheme/hardware combinations:
|
||||||
|
|
||||||
|
- FP8 static per-tensor: CUDA & HIP kernel
|
||||||
|
- FP8 dynamic per-token: CUDA & HIP kernel, AITER
|
||||||
|
- FP8 dynamic per-token-group (128/64): CUDA & HIP kernel, AITER
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rms_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rms_quant_fusion.py)
|
||||||
|
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
|
||||||
|
- CUDA/HIP kernels: [`csrc/layernorm_quant_kernels.cu`](https://github.com/vllm-project/vllm/blob/main/csrc/layernorm_quant_kernels.cu)
|
||||||
|
|
||||||
|
### SiLU+Mul + Quantization (`fuse_act_quant`)
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
Same as `fuse_norm_quant`: on NVIDIA, Inductor generates a faster fused kernel than our custom ops.
|
||||||
|
This fusion is only enabled when either `silu_and_mul` or `quant_fp8` are using a custom kernel,
|
||||||
|
or for NVFP4-quantized models (where FP4 quant is always a custom op).
|
||||||
|
|
||||||
|
**What it fuses.** Fuses the `silu_and_mul` gate-up projection activation with subsequent quantization into a single kernel,
|
||||||
|
avoiding materialization of the full-precision post-activation tensor.
|
||||||
|
|
||||||
|
Note that AITER fusions are in a separate pass in `vllm.compilation.passes.fusion.rocm_aiter_fusion`.
|
||||||
|
|
||||||
|
Supported quantization scheme/hardware combinations:
|
||||||
|
|
||||||
|
- FP8 static per-tensor: CUDA & HIP kernel
|
||||||
|
- NVFP4 dynamic: CUDA sm100+ only with FlashInfer
|
||||||
|
- FP8 per-token-group (128): ROCm AITER only
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/act_quant_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/act_quant_fusion.py)
|
||||||
|
- ROCm AITER pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py)
|
||||||
|
- CUDA/HIP kernels: [`csrc/quantization/`](https://github.com/vllm-project/vllm/blob/main/csrc/quantization/)
|
||||||
|
|
||||||
|
### RMSNorm + Padding (`fuse_act_padding`)
|
||||||
|
|
||||||
|
!!! info
|
||||||
|
ROCm/AITER-only. Targeted at GPT-OSS models.
|
||||||
|
|
||||||
|
**What it fuses.** Fuses a residual add + RMSNorm with a subsequent padding operation that pads
|
||||||
|
the hidden dimension to a multiple required by downstream AITER Triton GEMM kernels.
|
||||||
|
|
||||||
|
Requires: AMD ROCm with AITER RMSNorm enabled. Enabled by default in optimization level O1 and above
|
||||||
|
when the hidden size is 2880 and AITER Triton GEMMs are *not* enabled.
|
||||||
|
|
||||||
|
**Code locations.**
|
||||||
|
|
||||||
|
- Pass: [`vllm/compilation/passes/fusion/rocm_aiter_fusion.py`](https://github.com/vllm-project/vllm/blob/main/vllm/compilation/passes/fusion/rocm_aiter_fusion.py) (`RocmAiterTritonAddRMSNormPadFusionPass`)
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
- [Optimization Levels](optimization_levels.md) — high-level presets that set
|
||||||
|
fusion defaults.
|
||||||
|
- [torch.compile in vLLM](torch_compile.md) — how the Inductor pass pipeline
|
||||||
|
works.
|
||||||
|
- [Attention Backends](attention_backends.md) — attention-specific kernel
|
||||||
|
selection.
|
||||||
@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests,
|
|||||||
(s, d, UNIDIRECTIONAL or SWAP)
|
(s, d, UNIDIRECTIONAL or SWAP)
|
||||||
```
|
```
|
||||||
|
|
||||||
* If the Move specifies `UNIDRECTIONAL`:
|
* If the Move specifies `UNIDIRECTIONAL`:
|
||||||
|
|
||||||
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
|
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot
|
||||||
|
|
||||||
|
|||||||
@@ -507,10 +507,10 @@ longer relevant in v1:
|
|||||||
- `vllm:num_requests_swapped`
|
- `vllm:num_requests_swapped`
|
||||||
- `vllm:cpu_cache_usage_perc`
|
- `vllm:cpu_cache_usage_perc`
|
||||||
|
|
||||||
In this mode, when a request is preempted (e.g. to make room in KV
|
In this mode, when a request was preempted (e.g. to make room in KV
|
||||||
cache to complete other requests), we swap kv cache blocks out to CPU
|
cache to complete other requests), kv cache blocks were swapped out to
|
||||||
memory. This is also known as "KV cache offloading" and is configured
|
CPU memory. The `--swap-space` flag has been removed as this feature
|
||||||
with `--swap-space` and `--preemption-mode`.
|
is no longer used in V1.
|
||||||
|
|
||||||
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
Historically, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
||||||
SequenceGroup encapsulated the idea of N Sequences which
|
SequenceGroup encapsulated the idea of N Sequences which
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ V1 was not originally designed with async scheduling in mind, and support requir
|
|||||||
|
|
||||||
## 3. Removing Async Barrier
|
## 3. Removing Async Barrier
|
||||||
|
|
||||||
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.cuda.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
|
A key requirement for async execution is that CPU operations remain non-blocking. Both explicit sync (for example, `torch.accelerator.synchronize`) and implicit sync (for example, unpinned `.to("cuda")`) must be avoided.
|
||||||
|
|
||||||
However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
|
However, async execution can introduce race conditions when CPU and GPU concurrently touch the same memory.
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ th {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|
||||||
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
|
| ------- | ------------------ | ------------ | ------------- | ----- | --------------------- | --------- |
|
||||||
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
|
||||||
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
|
||||||
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
|
||||||
@@ -78,7 +78,7 @@ Most experts flavors include an equivalent modular interface which will be a sub
|
|||||||
To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
|
To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats.
|
||||||
|
|
||||||
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|
| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source |
|
||||||
|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
|
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
|
||||||
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
||||||
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
||||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||||
@@ -105,7 +105,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
|||||||
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.
|
||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
| backend | `FusedMoEPrepareAndFinalizeModular` subclasses | `FusedMoEExpertsModular` subclasses |
|
||||||
|---------|-----------------------------------------|----------------------------------------------|
|
| ------- | ---------------------------------------------- | ----------------------------------- |
|
||||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@@ -12,9 +12,8 @@ page for information on known issues and how to solve them.
|
|||||||
|
|
||||||
The use of Python multiprocessing in vLLM is complicated by:
|
The use of Python multiprocessing in vLLM is complicated by:
|
||||||
|
|
||||||
- The use of vLLM as a library and the inability to control the code using vLLM
|
- using vLLM as a library, which limits control over its internal code;
|
||||||
- Varying levels of incompatibilities between multiprocessing methods and vLLM
|
- incompatibilities between certain multiprocessing methods and vLLM dependencies.
|
||||||
dependencies
|
|
||||||
|
|
||||||
This document describes how vLLM deals with these challenges.
|
This document describes how vLLM deals with these challenges.
|
||||||
|
|
||||||
@@ -22,11 +21,9 @@ This document describes how vLLM deals with these challenges.
|
|||||||
|
|
||||||
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
|
||||||
|
|
||||||
- `spawn` - spawn a new Python process. The default on Windows and macOS.
|
- `spawn` - Spawn a new Python process. The default on Windows and macOS.
|
||||||
|
|
||||||
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
|
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
|
||||||
Linux for Python versions prior to 3.14.
|
Linux for Python versions prior to 3.14.
|
||||||
|
|
||||||
- `forkserver` - Spawn a server process that will fork a new process on request.
|
- `forkserver` - Spawn a server process that will fork a new process on request.
|
||||||
The default on Linux for Python version 3.14 and newer.
|
The default on Linux for Python version 3.14 and newer.
|
||||||
|
|
||||||
@@ -36,8 +33,8 @@ This document describes how vLLM deals with these challenges.
|
|||||||
threads. If you are under macOS, using `fork` may cause the process to crash.
|
threads. If you are under macOS, using `fork` may cause the process to crash.
|
||||||
|
|
||||||
`spawn` is more compatible with dependencies, but can be problematic when vLLM
|
`spawn` is more compatible with dependencies, but can be problematic when vLLM
|
||||||
is used as a library. If the consuming code does not use a `__main__` guard (`if
|
is used as a library. If the consuming code does not use a `__main__` guard
|
||||||
__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
|
(`if __name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
|
||||||
spawns a new process. This can lead to infinite recursion, among other problems.
|
spawns a new process. This can lead to infinite recursion, among other problems.
|
||||||
|
|
||||||
`forkserver` will spawn a new server process that will fork new processes on
|
`forkserver` will spawn a new server process that will fork new processes on
|
||||||
@@ -57,8 +54,7 @@ Multiple vLLM dependencies indicate either a preference or requirement for using
|
|||||||
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
|
- <https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors>
|
||||||
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
|
- <https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders>
|
||||||
|
|
||||||
It is perhaps more accurate to say that there are known problems with using
|
Known issues exist when using `fork` after initializing these dependencies.
|
||||||
`fork` after initializing these dependencies.
|
|
||||||
|
|
||||||
## Current State (v0)
|
## Current State (v0)
|
||||||
|
|
||||||
@@ -66,8 +62,8 @@ The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control w
|
|||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
|
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/envs.py#L339-L342>
|
||||||
|
|
||||||
When we know we own the process because the `vllm` command was used, we use
|
If the main process is controlled via the `vllm` command,
|
||||||
`spawn` because it's the most widely compatible.
|
`spawn` is used because it's the most widely compatible.
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/scripts.py#L123-L140>
|
||||||
|
|
||||||
@@ -104,8 +100,8 @@ dependencies and code using vLLM as a library.
|
|||||||
### Changes Made in v1
|
### Changes Made in v1
|
||||||
|
|
||||||
There is not an easy solution with Python's `multiprocessing` that will work
|
There is not an easy solution with Python's `multiprocessing` that will work
|
||||||
everywhere. As a first step, we can get v1 into a state where it does "best
|
everywhere. As a first step, we can get v1 into a state where it does
|
||||||
effort" choice of multiprocessing method to maximize compatibility.
|
"best effort" choice of multiprocessing method to maximize compatibility.
|
||||||
|
|
||||||
- Default to `fork`.
|
- Default to `fork`.
|
||||||
- Use `spawn` when we know we control the main process (`vllm` was executed).
|
- Use `spawn` when we know we control the main process (`vllm` was executed).
|
||||||
@@ -154,8 +150,8 @@ RuntimeError:
|
|||||||
### Detect if a `__main__` guard is present
|
### Detect if a `__main__` guard is present
|
||||||
|
|
||||||
It has been suggested that we could behave better if we could detect whether
|
It has been suggested that we could behave better if we could detect whether
|
||||||
code using vLLM as a library has a `__main__` guard in place. This [post on
|
code using vLLM as a library has a `__main__` guard in place. This
|
||||||
stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
|
[post on Stack Overflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
|
||||||
was from a library author facing the same question.
|
was from a library author facing the same question.
|
||||||
|
|
||||||
It is possible to detect whether we are in the original, `__main__` process, or
|
It is possible to detect whether we are in the original, `__main__` process, or
|
||||||
@@ -192,4 +188,4 @@ that works around these challenges.
|
|||||||
2. We can explore other libraries that may better suit our needs. Examples to
|
2. We can explore other libraries that may better suit our needs. Examples to
|
||||||
consider:
|
consider:
|
||||||
|
|
||||||
- <https://github.com/joblib/loky>
|
- <https://github.com/joblib/loky>
|
||||||
|
|||||||
@@ -1,29 +1,18 @@
|
|||||||
<!-- markdownlint-disable -->
|
|
||||||
|
|
||||||
# Optimization Levels
|
# Optimization Levels
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
|
vLLM provides 4 optimization levels (`-O0`, `-O1`, `-O2`, `-O3`) that allow users to trade off startup time for performance:
|
||||||
|
|
||||||
|
- `-O0`: No optimization. Fastest startup time, but lowest performance.
|
||||||
|
- `-O1`: Fast optimization. Simple compilation and fast fusions, and PIECEWISE cudagraphs.
|
||||||
|
- `-O2`: Default optimization. Additional compilation ranges, additional fusions, FULL_AND_PIECEWISE cudagraphs.
|
||||||
|
- `-O3`: Aggressive optimization. Currently equal to `-O2`, but may include additional time-consuming or experimental optimizations in the future.
|
||||||
|
|
||||||
|
All optimization level defaults can be achieved by manually setting the underlying flags.
|
||||||
|
User-set flags take precedence over optimization level defaults.
|
||||||
|
|
||||||
## Level Summaries and Usage Examples
|
## Level Summaries and Usage Examples
|
||||||
```bash
|
|
||||||
# CLI usage
|
|
||||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
|
|
||||||
|
|
||||||
# Python API usage
|
|
||||||
from vllm.entrypoints.llm import LLM
|
|
||||||
|
|
||||||
llm = LLM(
|
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
|
||||||
optimization_level=0
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### `-O1`: Quick Optimizations
|
|
||||||
- **Startup**: Moderate startup time
|
|
||||||
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
|
|
||||||
- **Use case**: Balance for most development scenarios
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# CLI usage
|
# CLI usage
|
||||||
@@ -34,31 +23,59 @@ from vllm.entrypoints.llm import LLM
|
|||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
model="RedHatAI/Llama-3.2-1B-FP8",
|
||||||
optimization_level=1
|
optimization_level=2 # equivalent to -O2
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### `-O2`: Full Optimizations (Default)
|
### `-O0`: No Optimization
|
||||||
- **Startup**: Longer startup time
|
|
||||||
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
|
|
||||||
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
|
|
||||||
|
|
||||||
```bash
|
Startup as fast as possible - no autotuning, no compilation, and no cudagraphs.
|
||||||
# CLI usage (default, so optional)
|
This level is good for initial phases of development and debugging.
|
||||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
|
|
||||||
|
|
||||||
# Python API usage
|
Settings:
|
||||||
from vllm.entrypoints.llm import LLM
|
|
||||||
|
|
||||||
llm = LLM(
|
- `-cc.cudagraph_mode=NONE`
|
||||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
- `-cc.mode=NONE` (also resulting in `-cc.custom_ops=["none"]`)
|
||||||
optimization_level=2 # This is the default
|
- `-cc.pass_config.fuse_...=False` (all fusions disabled)
|
||||||
)
|
- `--kernel-config.enable_flashinfer_autotune=False`
|
||||||
```
|
|
||||||
|
|
||||||
#### `-O3`: Full Optimization
|
### `-O1`: Fast Optimization
|
||||||
Still in development. Added infrastructure to prevent changing API in future
|
|
||||||
release. Currently behaves the same O2.
|
Prioritize fast startup, but still enable basic optimizations like compilation and cudagraphs.
|
||||||
|
This level is a good balance for most development scenarios where you want faster startup but
|
||||||
|
still make sure your code does not break cudagraphs or compilation.
|
||||||
|
|
||||||
|
Settings:
|
||||||
|
|
||||||
|
- `-cc.cudagraph_mode=PIECEWISE`
|
||||||
|
- `-cc.mode=VLLM_COMPILE`
|
||||||
|
- `--kernel-config.enable_flashinfer_autotune=True`
|
||||||
|
|
||||||
|
Fusions:
|
||||||
|
|
||||||
|
- `-cc.pass_config.fuse_norm_quant=True`*
|
||||||
|
- `-cc.pass_config.fuse_act_quant=True`*
|
||||||
|
- `-cc.pass_config.fuse_act_padding=True`†
|
||||||
|
- `-cc.pass_config.fuse_rope_kvcache=True`† (will be moved to O2)
|
||||||
|
|
||||||
|
\* These fusions are only enabled when either op is using a custom kernel, otherwise Inductor fusion is better.</br>
|
||||||
|
† These fusions are ROCm-only and require AITER.
|
||||||
|
|
||||||
|
### `-O2`: Full Optimization (Default)
|
||||||
|
|
||||||
|
Prioritize performance at the expense of additional startup time.
|
||||||
|
This level is recommended for production workloads and is hence the default.
|
||||||
|
Fusions in this level _may_ take longer due to additional compile ranges.
|
||||||
|
|
||||||
|
Settings (on top of `-O1`):
|
||||||
|
|
||||||
|
- `-cc.cudagraph_mode=FULL_AND_PIECEWISE`
|
||||||
|
- `-cc.pass_config.fuse_allreduce_rms=True`
|
||||||
|
|
||||||
|
### `-O3`: Aggressive Optimization
|
||||||
|
|
||||||
|
This level is currently the same as `-O2`, but may include additional optimizations
|
||||||
|
in the future that are more time-consuming or experimental.
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
|
|||||||
@@ -141,7 +141,7 @@ Every plugin has three parts:
|
|||||||
- triton ops
|
- triton ops
|
||||||
Custom way doesn't work for triton ops now.
|
Custom way doesn't work for triton ops now.
|
||||||
|
|
||||||
7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
|
7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
|
||||||
|
|
||||||
## Compatibility Guarantee
|
## Compatibility Guarantee
|
||||||
|
|
||||||
@@ -155,4 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
|
|||||||
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
||||||
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
||||||
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||||
- `prompt` in `Platform.validate_request` is deprecated and will be removed in v0.18.0.
|
- `prompt` in `Platform.validate_request` is deprecated. It has been removed in v0.18.0.
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ This feature is off by default, but can be enabled by setting `compile_mm_encode
|
|||||||
|
|
||||||
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
|
To compile a multimodal component such as an encoder, we follow the same mechanism as the LLM text backbone, with a few additional scaffoldings:
|
||||||
|
|
||||||
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_vit`. This will gate the compilation behind our
|
1. The `@support_torch_compile` decorator should include `enable_if=should_torch_compile_mm_encoder`. This will gate the compilation behind our
|
||||||
`compile_mm_encoder` configuration
|
`compile_mm_encoder` configuration
|
||||||
|
|
||||||
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
2. `with set_model_tag("<component_name>", is_encoder=True)` context manager should be used around the nn.Module's instantiation. Since torch.compile
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ th:not(:first-child) {
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
| Feature | [CP](../configuration/optimization.md#chunked-prefill) | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](speculative_decoding/README.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
|
||||||
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
| [CP](../configuration/optimization.md#chunked-prefill) | ✅ | | | | | | | | | | | | | | |
|
||||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||||
@@ -60,7 +60,7 @@ th:not(:first-child) {
|
|||||||
### Feature x Hardware
|
### Feature x Hardware
|
||||||
|
|
||||||
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
|
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU |
|
||||||
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------|
|
| ------- | ----- | ------ | ------ | --- | ------ | --- | --- | --------- |
|
||||||
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user