Compare commits

...

158 Commits

Author SHA1 Message Date
Cyrus Leung
e5de19ff9a [CI/Build] Don't auto-rebase PRs with CI failures (#39443)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 13:57:37 -07:00
zzaebok
edee96519a [Spec Decode] fix returning size mismatch on extract hidden states proposer (#38610)
Signed-off-by: Jaebok Lee <jaebok9541@naver.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2026-04-09 20:39:39 +00:00
Rishi Puri
adaabb8a55 Add nightly b200 test for spec decode eagle correctness (#38577)
Signed-off-by: Rishi Puri <riship@nvidia.com>
2026-04-09 20:09:09 +00:00
Ekagra Ranjan
f7cad67412 [ASR] Fix spacing bw chunks in multi chunk audio transcription (#39116)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
2026-04-09 12:46:33 -07:00
Xinyu Chen
a8134aef4e [XPU] check is_xccl_available before oneccl warmup (#39302)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
2026-04-09 12:42:17 -07:00
Michael Goin
2800706f06 [Refactor] Move NVFP4 GEMM management into NvFp4LinearKernel (#39129)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-04-09 15:05:36 -04:00
Cyrus Leung
0d310ffbeb [CI/Build] Update auto-rebase rule (#39429)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 10:59:56 -07:00
Micah Williamson
d5f75fdf50 [ROCm] Correctly guard fused_silu_mul_block_quant on ROCm (#39387)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-04-09 17:59:03 +00:00
PikaPikachu
827268e98d [Quantization] Support Quark W8A8 INT8 MoE inference (#36320)
Signed-off-by: kangletian <Letian.Kang@amd.com>
2026-04-09 17:24:43 +00:00
Wentao Ye
56e19d7ee2 [Model Runner V2] Fix flex attention kv blocks calculation issue (#39353)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-09 13:07:43 -04:00
Andreas Karatzas
9036d4c464 [ROCm][CI] Resolved nvidia package deps issue (#39421)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-10 00:06:06 +08:00
Lucas Kabela
a8c6ee9b78 [Performance Improvement] Update batched_count_greater_than to handle batch size 1 without recompile (#38933)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-04-09 23:51:31 +08:00
Cyrus Leung
3b1d9c3156 [CI/Build] Fix memory cleanup in MM test (#39411)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 08:50:45 -07:00
Cyrus Leung
54d244f28f [UX] Improve error message for MM input too long (#39409)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 13:20:19 +00:00
Richard Zou
6c749399b7 [BugFix] fix tests/kernels/moe/test_moe_layer.py (#39404)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-04-09 08:48:59 -04:00
lalit10
91eea72330 [Tests] Add Qwen3-VL multimodal memory leak check (#39268)
Signed-off-by: Lalit Laxminarayan Bangad <lalitbangad@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 04:54:46 -07:00
Andrii Skliar
df2503e125 nemotron-nano-vl: Allow use_audio_in_video to be passed at vllm serve time (#38538)
Signed-off-by: Andrii Skliar <askliar@nvidia.com>
Co-authored-by: Andrii Skliar <askliar@nvidia.com>
2026-04-09 11:44:39 +00:00
Nick Hill
c8d98f81f6 [Core] Simplify API server handshake (#39364)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-04-09 18:56:15 +08:00
Harry Mellor
d87fb264df [Docs] Bring README updates into docs README (#39397)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-04-09 10:35:00 +00:00
wang.yuqi
66c079ae83 [Frontend][4/n] Improve pooling entrypoints | pooling. (#39153)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-04-09 10:09:45 +00:00
Shengqi Chen
b6c9be509e [CI] fix possible user permission issues in nightly index generation (#39390)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
2026-04-09 08:14:07 +00:00
Qidong Su
ed733802f0 Fix NUMA binding on non-CDMM Grace-Blackwell systems (#39361)
Signed-off-by: Qidong Su <soodoshll@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 07:36:51 +00:00
Andrew Barnes
8a34c5087a [ROCm] Remove unnecessary fp8 roundtrip in gather cache NHD dequant (#39122)
Signed-off-by: Bortlesboat <bortstheboat@gmail.com>
2026-04-09 15:12:22 +08:00
Wentao Ye
ed2f282bc8 [Perf] Optimize redundant sync for pooling model, 3.7% Throughput Improvement (#39113)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-08 23:12:23 -07:00
Zhewen Li
9e78555743 [Docker] Add fastsafetensors to NVIDIA Dockerfile (#38950) 2026-04-08 22:21:37 -07:00
sihao_li
e80e633927 [XPU] Skip VLLM_BATCH_INVARIANT for XPU in EAGLE DP test (#39164)
Signed-off-by: sihao.li <sihao.li@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-09 12:45:16 +08:00
Khairul Kabir
490f17d0c7 [Multimodal] Fix nested_tensors_equal: add length check for lists and tuple support (#38388)
Signed-off-by: khairulkabir1661 <khairulkabir1661@users.noreply.github.com>
Co-authored-by: khairulkabir1661 <khairulkabir1661@users.noreply.github.com>
2026-04-09 04:40:37 +00:00
Yongye Zhu
2e98406048 [Refactor] Improve indexer decode path metadata preparation (#38865) 2026-04-08 20:49:15 -07:00
Chendi.Xue
ef5a226819 [PD][HeteroArch]Fix accuracy issue with CPU_ATTN as Decoder and Flash_ATTN as prefiller (#38935)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2026-04-09 11:19:07 +08:00
Wentao Ye
aec18492d0 [CI] Fix mypy for vllm/v1/ops (#39219)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-09 11:06:34 +08:00
noobHappylife
2a49284c8a Fix Responses JSON schema alias serialization (#38519)
Signed-off-by: noobhappylife <aratar1991@hotmail.com>
Co-authored-by: OpenAI Codex <codex@openai.com>
2026-04-09 10:50:16 +08:00
Ilya Boytsov
d37b378762 [Model] Update ColModernVBERT to support latest HF checkpoint (#39307)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
2026-04-09 10:48:51 +08:00
Wei Zhao
92fbec391b [Bug] Fix routing bias dtype for trtllm per-block fp8 moe (#38989)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2026-04-08 19:42:43 -07:00
Ajay Anubolu
2f41d6c063 [Bugfix] Fix cpu-offload-gb assertion with non-default block sizes (#36461)
Signed-off-by: AjAnubolu <anuboluajay@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-04-08 19:42:16 -07:00
Dipika Sikka
3aecdf08b4 [Gemma4] Support quantized MoE (#39045)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2026-04-08 21:57:53 -04:00
Michael Goin
eb4205fee5 [UX] Integrate DeepGEMM into vLLM wheel via CMake (#37980)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
2026-04-08 18:56:32 -07:00
liuzhenwei
83aea2147f [XPU][UT] update UTs in CI (#39296)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Kunshang Ji <jikunshang95@gmail.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-09 09:38:16 +08:00
Maral
2e9034c998 [W8A8 Block Linear Refactor][2/N] Remove W8A8Fp8BlockLinearOp and adopt Fp8 block linear kernel selections. (#33892)
Signed-off-by: maral <maralbahari.98@gmail.com>
Signed-off-by: Maral <maralbahari.98@gmail.com>
2026-04-09 08:50:39 +08:00
Benjamin Chislett
8332078cfd [Bugfix] FlashInfer MXINT4 MoE crashes, missing do_finalize (#39315)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-08 20:36:33 -04:00
Richard Zou
ba4a78eb5d [torch.compile] Allow usage of Opaque Objects in PyTorch 2.11 (#39286)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-04-08 23:21:10 +00:00
Kai Song
f3c7941ec8 [Bugfix]Fix EP precision for Qwen3.5, Qwen3-Next (#39181)
Signed-off-by: Song Kai <songkai05@baidu.com>
2026-04-09 01:47:48 +04:00
Wentao Ye
3352bf8b03 [CI Bug] Fix pre-commit issue in main (#39347)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-08 14:10:05 -07:00
triangleXIV
7c94ae16c6 [BugFix] --max-model-len=-1 causes over-limit requests to hang and starve the entire service (#39102)
Signed-off-by: triangle14 <y1019026570@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2026-04-08 14:03:17 -07:00
Rishi Puri
ad05edfbca tests/v1/e2e/spec_decode: assert async scheduling is used (#39206)
Signed-off-by: Rishi Puri <riship@nvidia.com>
Signed-off-by: Rishi Puri <puririshi98@berkeley.edu>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: Flora Feng <4florafeng@gmail.com>
2026-04-08 20:30:03 +00:00
Wentao Ye
2018137242 [Feature] Batch invariant nvfp4 linear support (#39322)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-08 16:29:13 -04:00
Jackmin801
a776a48b1c [MoE] Move DEEP_GEMM into experts/ subdirectory (#39005)
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-08 19:23:08 +00:00
Ben Browning
8477fe427d [Tool] adjust_request to reasoning parser, and Gemma4 fixes (#39027)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-04-08 19:04:04 +00:00
Lain
e24e0a43a4 [Attention] relax the head dim 512 and paged kv for sm90+FA4 (#38835)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-04-08 18:23:18 +00:00
Roberto L. Castro
b55d830ec7 [Perf][Kernel] Persistent TopK scheduler: unified CUDAGraph-safe kernel with dynamic per-row dispatch - DeepSeek-V3.2 DSA decode (#37421)
Signed-off-by: LopezCastroRoberto <rocastro@redhat.com>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2026-04-08 13:35:57 -04:00
Shengqi Chen
75e01a39a1 [Feature] NUMA binding support for GPU workers (#38635)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
Co-authored-by: Jason Li <jasonlizhengjian@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-04-08 09:55:24 -07:00
Or Ozeri
512c5eb455 [kv_offload+HMA][5/N]: Track group block hashes and block IDs (#37109)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2026-04-08 19:50:28 +03:00
Flora Feng
13151a4df4 [Bugfix] Fix Gemma4 streaming tool call corruption for split boolean/number values (#39114)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-04-08 16:46:27 +00:00
Gregory Shtrasberg
56c976c1b5 [ROCm] Enable fused_silu_mul_block_quant on ROCm (#38817)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-04-08 11:23:32 -05:00
Frederik Gossen
d74a306c4b [Core] Use tuple_return in split_module for tuple-conformant subgraphs (#38752)
Signed-off-by: Frederik Gossen <frgossen@meta.com>
Co-authored-by: Boyuan Feng <boyuan@meta.com>
2026-04-08 09:09:58 -07:00
Gregory Shtrasberg
0e9f0a516c [ROCm][CI-Build] Cherry pick triton BUFFER_OPS fix and update AITER (#38580)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-04-08 10:38:03 -05:00
haosdent
8904fc4d19 [Bugfix] Fix V1 logprobs empty strings for multi-byte UTF-8 tokens when logprobs > 0 (#34875)
Signed-off-by: haosdent <haosdent@gmail.com>
2026-04-08 15:30:00 +00:00
nemanjaudovic
1a2c17634e [Bugfix] Add missing ASRDataset import and CLI args in benchmarks/throughput.py (#38114)
Signed-off-by: nemanjaudovic <nudovic@amd.com>
2026-04-08 13:53:53 +00:00
Matthew Bonanni
308cec5864 [FlashAttention] Symlink FA4 instead of copying when using VLLM_FLASH_ATTN_SRC_DIR (#38814)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2026-04-08 12:04:34 +00:00
wang.yuqi
4e2ab1861d [CI Failure] pin nomic-embed-text-v1 revision (#39292)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
2026-04-08 11:43:06 +00:00
JartX
140cbb1186 [Bugfix] Cuda Clean up scales Kvcache fp8/int8_per_token_head (#39224)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-04-08 04:08:04 -07:00
Kevin H. Luu
6155bbd1dd [Bugfix][Docs] Fix ReadTheDocs build crash from mocked torch decorator (#39284)
Signed-off-by: khluu <khluu000@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 09:43:01 +00:00
rasmith
78434b923c [CI][AMD][BugFix][Kernel] Cast induction variable to int64 on MI350 for chunk_gated_delta_rule_fwd_kernel_h_blockdim64 to avoid illegal memory access (#39087)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-04-08 16:57:18 +08:00
Michael Goin
2488d1dca2 [Docs] Update README (#39251)
Signed-off-by: mgoin <mgoin64@gmail.com>
2026-04-08 11:34:07 +08:00
yoke
d734445fcd [Bugfix][Frontend] Fix Gemma4 streaming HTML duplication after tool calls (#38909)
Signed-off-by: yoke233 <yoke2012@gmail.com>
2026-04-08 11:03:54 +08:00
Flora Feng
927975ead8 [Parser] Migrate response api streaming to unified parser (#38755)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Andrew Xia <axia@meta.com>
2026-04-08 10:09:00 +08:00
Flora Feng
9ea7d670d8 [Bugfix] Fix Qwen3 tool parser for Responses API tools (#38848)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-04-08 10:08:51 +08:00
Varun Sundar Rabindranath
7b80cd8ac3 [Docs] Add Phi-4-reasoning-vision to supported models + examples (#39232)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2026-04-08 02:02:26 +00:00
Andrey Talman
2111997f96 [release 2.11] Update to torch 2.11 (#34644) 2026-04-07 18:55:48 -07:00
Flora Feng
5af684c319 [CI] Add reasoning parser tests to CI (#37025)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-04-08 00:57:36 +00:00
Md. Mekayel Anik
d521dcdbcc docs: clarify SMT and OMP acronyms in CpuPlatform (#39085) 2026-04-07 17:42:07 -07:00
Giancarlo Delfin
5daf62271d [Model Runner V2] Fuse probabilistic rejection sample kernels (#38496)
Signed-off-by: Giancarlo Delfin <gdelfin@inferact.ai>
2026-04-07 17:37:37 -07:00
zofia
ad3304425b [XPU] add xpu backend implementation of mxfp8 quant (#38682)
Signed-off-by: Zhu, Zufang <zufang.zhu@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-08 08:30:35 +08:00
Lucas Wilkinson
70406eb1dc [Attention][V0 Deprecation] Deprecate accept output buffer (#39125)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-04-07 17:14:58 -04:00
Yubo Wang
08bfedc152 [Bugfix] Fix extract_hidden_states crash with quantized KV cache dtype (#39160)
Signed-off-by: Yubo Wang <yubowang2019@gmail.com>
2026-04-07 11:18:33 -07:00
Flora Feng
0102bd2f4c [Parser] Pass request.tools to tool parser (#38860)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2026-04-08 01:36:21 +08:00
rasmith
83d09d36b5 [CI][Bugfix][AMD] Ensure weights created when emulating OCP MXFP4 (#36993)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2026-04-08 00:37:16 +08:00
Chendi.Xue
92b9afeecd [XPU] Quick fix for TritonMLA to remove cuda hardcode (#39088)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-08 00:17:58 +08:00
Jinzhen Lin
7310555482 [Bugfix] Fix marlin nvfp4 rescaling (#37502)
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
2026-04-07 08:57:17 -07:00
ibifrost
96b5004b71 [KVConnector] Support 3FS KVConnector (#37636)
Signed-off-by: wuchenxin <wuchenxin.wcx@alibaba-inc.com>
Signed-off-by: ibifrost <47308427+ibifrost@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2026-04-07 15:46:00 +00:00
kkyyxhll
98e1a43af7 [Bugfix][Quantization] Fix PerTensorScale loading with tuple shard_id in MergedColumnParallelLinear (#38517)
Signed-off-by: loukang <loukang@xiaohongshu.com>
2026-04-07 11:16:26 -04:00
maobaolong
729eb59f60 [KVConnector]: prioritize external connector over internal registry (#38301)
Signed-off-by: baoloongmao <baoloongmao@tencent.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2026-04-07 15:03:11 +00:00
Ilya Boytsov
6e1100889e fix(test): recompute Jina ColBERT rotary inv_freq cleared by transformers v5 weight loader (#39176)
Signed-off-by: Ilya Boytsov <ilyaboytsov1805@gmail.com>
2026-04-07 22:40:55 +08:00
Harry Mellor
edcc37a8ce Fix Mistral yarn warning in Transformers v5 (#37292)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Julien Denize <40604584+juliendenize@users.noreply.github.com>
2026-04-07 13:23:33 +00:00
Harry Mellor
79df4a794d Automatically add links to API docs for matching strings in docs (#37434)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-04-07 21:21:18 +08:00
Ronen Schaffer
7c139ab23f [KV Offload] Clean up ARC/LRU refactoring leftovers: group ARC tests and fix stale comment (#38217)
Signed-off-by: Ronen Schaffer <ronen.schaffer@ibm.com>
2026-04-07 15:14:45 +03:00
Wei Zhao
0be9516ea4 [Bug] Fix Trtllm Fp8 MoE Weight Shuffle Memory Fragmentation (#39054)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
2026-04-07 08:04:08 -04:00
Kyle Mylonakis
7b9de7c892 [Bugfix] Correct mistake in chained comparison in static assert logic (#38699)
Signed-off-by: Kyle Mylonakis <kyle@protopia.ai>
2026-04-07 18:24:39 +08:00
Rohan Potdar
dd9342e6bc only patch runtime_env for torch >= 2.10 (#38763)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
2026-04-07 09:29:23 +00:00
Jiangyun Zhu
8060bb0333 [vLLM IR] rework gemma_rms_norm (#39014)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-04-07 01:37:00 -07:00
Rishapveer Singh
da4c0e4db9 [Model] Use AutoWeightsLoader for FalconH1 (#39092)
Signed-off-by: Rishapveer Singh <215205492+rishaps@users.noreply.github.com>
2026-04-07 16:25:17 +08:00
Netanel Haber
a9a0e0551f nano-nemotron-vl: get_mm_max_tokens_per_item for audio, video, image == seq_len (#38727)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-04-07 00:23:29 -07:00
Andrew Barnes
5c35517a3e [ROCm] Remove unused IS_FNUZ parameter from reshape_and_cache_shuffle_kernel (#39123)
Signed-off-by: Bortlesboat <bortstheboat@gmail.com>
2026-04-07 07:17:59 +00:00
Andreas Karatzas
a435e3108d [ROCm][CI] Fix test repo-root assumptions (#39053)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-07 13:36:21 +08:00
Andreas Karatzas
2df2c85be4 [Kernels][MoE] Fix legacy_routing to use bitmatrix-based routing path (#38504)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-07 10:57:09 +08:00
Nick Hill
62095e82c1 [BugFix][MRV2] Fix cuda event reuse race (#39115)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-04-07 00:21:09 +00:00
bnellnm
b2b2c5239e [MoE Refactor] Split up compressed_tensors_moe.py (#38960)
Signed-off-by: Bill Nell <bnell@redhat.com>
2026-04-06 20:07:54 -04:00
fxmarty-amd
00d7b497b3 [NVFP4] Support NVFP4 dense models from modelopt and compressed-tensors on AMD Instinct MI300, MI355X and Hopper through emulation (#35733)
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Signed-off-by: fxmarty-amd <felmarty@amd.com>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
2026-04-06 16:18:27 -06:00
Matthew Bonanni
9c81f35b1a [Attention][MLA] Re-enable FA4 as default MLA prefill backend (#38819) 2026-04-06 17:51:46 -04:00
Woosuk Kwon
f186cfe75e [MRV2] Fix hanging issue with DeepSeek V3.2 by setting skip_attn=False (#39098)
Signed-off-by: WoosukKwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-04-06 12:55:13 -07:00
Netanel Haber
dfa5062a8f NemotronH default mamba_ssm_cache_dtype=float32; enable auto-hook for NemotronHNanoVLV2Config (#39032)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-04-06 19:47:46 +00:00
Yongye Zhu
e8ebbdde83 [Quantization] Add FlashInfer CuteDSL batched experts backend for NVFP4 MoE (#38251)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2026-04-06 11:57:53 -07:00
namgyu-youn
94fbb09894 [EASY] Drop duplicate KV-cache initialization (#38799)
Signed-off-by: namgyu-youn <namgyu.dev@gmail.com>
2026-04-06 18:05:39 +00:00
Wentao Ye
419e73cdfa [Bug] Fix mistral version dependency (#39086)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-06 13:31:19 -04:00
bnellnm
f01482408c [MoE Refactor][Test] FusedMoE layer test (#24675)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-06 17:17:23 +00:00
zhanqiuhu
bfdc0a3a99 [NIXL][Mamba][3/N] Heterogeneous TP: 3-read conv state transfer (#37635) 2026-04-06 19:07:02 +02:00
bnellnm
93bada494f [MoE Refactor] Split of DefaultMoERunner class (#35326)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-06 12:41:59 -04:00
Frederik Gossen
608914de30 [Core] Re-enable Inductor pre-grad passes in standalone compile (torch>=2.12) (#38944)
Signed-off-by: Frederik Gossen <frgossen@meta.com>
2026-04-06 09:37:13 -07:00
Wentao Ye
4ae218c122 [Refactor] Remove unused dead code (#38842)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-06 11:52:05 -04:00
Lukas Geiger
f40d9879f2 [Models][GDN] Remove GPU/CPU syncs in GDNAttentionMetadata.build during speculative decoding (#38047)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2026-04-06 15:39:37 +00:00
Lucas Wilkinson
47e605092b [Gemma4] Enable Fast Prefill Optimization (#38879)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-04-06 11:19:39 -04:00
Walter Beller-Morales
e69a265135 [Feat][Core] safely abort requests when FSM fails to advance (#38663)
Signed-off-by: walterbm <walter.beller.morales@gmail.com>
2026-04-06 08:00:16 -07:00
Julien Denize
fef56c1855 [Mistral Grammar] Support Grammar Factory (#38150)
Signed-off-by: juliendenize <julien.denize@mistral.ai>
2026-04-06 10:28:51 -04:00
bhargav-patel-29
c5e3454e5a [Model] Add support for BharatGen's Param2MoE model (#38000)
Signed-off-by: bhargav-patel-29 <bhargav.patel@tihiitb.org>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-06 16:19:56 +08:00
liuchenbing2026
f6983f01de MiniMax-M2: add Eagle3 speculative decoding support (#37512)
Signed-off-by: liuchenbing <chenliumail@163.com>
Signed-off-by: liucb <liuchengbao_work@163.com>
Co-authored-by: liuchenbing <chenliumail@163.com>
2026-04-05 19:50:18 -07:00
Andreas Karatzas
780ba37458 [ROCm][Quantization] Add asymmetric INT8 quantization support to TritonInt8ScaledMMLinearKernel (#38501)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-06 09:42:10 +08:00
Micah Williamson
9570654c6d [ROCm][CI] Run Kernels Core Operation Test On MI325 and mitigate flakiness (#38184)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2026-04-06 09:42:02 +08:00
Netanel Haber
d56e952239 nano_nemotron_vl: fix tensor device mismatch exception when video profiling (#39029)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-04-05 22:23:45 +00:00
Kevin H. Luu
56de443db1 [ci] Switch some CI jobs to H200 MIG slices (#38956) 2026-04-05 13:26:11 -07:00
Greg Pereira
4dd49b06f8 [Bug] Fix Import paths for encoder_cudagraph modules (#38997)
Signed-off-by: greg pereira <grpereir@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-05 19:11:58 +00:00
Greg Pereira
f53fa26e05 [Bugfix] Fix invalid JSON in Gemma 4 streaming tool calls by stripping partial delimiters (#38992)
Signed-off-by: greg pereira <grpereir@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-05 17:11:18 +00:00
Wei Zhao
1af6f78ae5 [Perf] Change Trtllm fp8 MoE to use Shuffled Weights and BlockMajorK Layout (#38993)
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-05 10:54:31 -04:00
Martin Vit
228023b3a5 [Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)
Signed-off-by: Martin Vit <martin@voipmonitor.org>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-05 10:28:31 -04:00
Aaron Batilo
9a528260ef [Bugfix][Spec Decode] Fix extract_hidden_states for VLM models (#38987)
Signed-off-by: Aaron Batilo <abatilo@coreweave.com>
2026-04-05 02:41:54 -07:00
Robert Shaw
968ed02ace [Quantization][Deprecation] Remove Petit NVFP4 (#32694)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-04-05 00:07:45 +00:00
Robert Shaw
7d266abb22 Revert "[vLLM IR] gemma_rms_norm" (#38998) 2026-04-04 17:48:08 -04:00
Xiaoshuang Wang
156405d243 [vLLM IR] gemma_rms_norm (#38780)
Signed-off-by: Icey <1790571317@qq.com>
2026-04-04 13:55:52 -04:00
Artem Perevedentsev
99e5539a67 [Perf][GDN] Align TMA usage with upstream FLA (#38981)
Signed-off-by: Artem Perevedentsev <aperevedents@nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-05 00:38:02 +08:00
Linkun
a88ce94bbb [IR][RmsNorm] pass None if not has_weight (#38961)
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-04-04 11:02:30 -04:00
Ziming Qi
2a36d8fb72 [Bugfix][CPU] Fix macOS compatibility broken by #36487 (#38970)
Signed-off-by: Ziming (2imi9) <148090931+2imi9@users.noreply.github.com>
2026-04-04 14:05:58 +00:00
lalit10
93726b2a1c Refactor Arctic loading to use AutoWeightsLoader (#38955)
Signed-off-by: Lalit Laxminarayan Bangad <lalitbangad@gmail.com>
Co-authored-by: Lalit Laxminarayan Bangad <lalitbangad@meta.com>
2026-04-04 05:01:09 +00:00
Yongye Zhu
8617f8676b [Bugfix] Fix DSV32 weight loading (#38870)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2026-04-03 19:57:52 -07:00
Andreas Karatzas
06fd9ffcc4 [ROCm][CI] Fix ROCm Dockerfile conftest generation for older Docker parsers (#38959)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-04 10:41:41 +08:00
Wentao Ye
cab4064cd5 [Bug] Fix workspace manager _current_workspaces size (#38853)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-04-04 01:29:45 +00:00
Wentao Ye
062f1a2d70 [Bug] Fix compile error for swap_blocks_batch in CUDA 13 (#38915) 2026-04-03 16:56:38 -07:00
elenalil-aws
81994e1d0e [Bugfix][LoRA] Fix missing in_proj_z in Qwen3_5ForConditionalGenerati… (#38927)
Signed-off-by: elenalil-aws <elenalil@amazon.com>
2026-04-03 23:30:09 +00:00
Andreas Karatzas
4b506ff90a [ROCm][CI] Minor missing import patch (#38951)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-03 23:01:20 +00:00
Andreas Karatzas
5875bb2e9c [ROCm][CI] Added back missing common deps (#38937)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-04-03 15:58:57 -07:00
Kevin H. Luu
f0d3ad9f3e [ci] Remove soft fail for AMD image build job (#38941)
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
2026-04-03 20:42:33 +00:00
Divin Honnappa
121ea5a21f Removed GPU state confirmation and cleanup steps. (#38238)
Signed-off-by: Divin Honnappa <divin.honnappa@amd.com>
2026-04-03 13:11:08 -07:00
Jeffrey Wang
ab79863e6c Remove MQ multi-node tests (#38934)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
2026-04-03 20:00:08 +00:00
Nick Hill
5f1de2b14b [Model Runner V2] Add config validation for not-yet-supported features (#38758)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
2026-04-03 12:08:08 -07:00
yzong-rh
a5a623d961 [Bugfix] Re-enable Renormalize routing for TRT-LLM MoE experts (#38859)
Signed-off-by: Yifan Zong <yzong@redhat.com>
2026-04-04 01:48:17 +08:00
Xiaoshuang Wang
f8c3af2d85 [vLLM IR] add import_ir_kernels() to support OOT platforms (#38807)
Signed-off-by: Icey <1790571317@qq.com>
2026-04-03 17:25:19 +00:00
danisereb
50cd5674b3 Fix invalid logprobs with MTP enabled and sync scheduling (#38711)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
2026-04-03 12:24:37 -04:00
Vasiliy Kuznetsov
7b1a7423be [Frontend] new online quantization frontend (#38138)
Signed-off-by: Vasiliy Kuznetsov <vasiliy@meta.com>
2026-04-03 11:58:39 -04:00
Nicolò Lucchesi
97f92c6b47 [KVConnector] Skip register_kv_caches on profiling (#38558)
Signed-off-by: NickLucche <nlucches@redhat.com>
2026-04-03 15:40:16 +00:00
Yusuf Mohammad
46f02e00f2 [Bugfix] Fix AWQ models batch invariance issues (#38670)
Signed-off-by: yusuf <yusuf@deeplearningmachine.mynet>
Signed-off-by: <>
Co-authored-by: yusuf <yusuf@deeplearningmachine.mynet>
2026-04-03 14:54:15 +00:00
Qiming Zhang
6b4872240f [XPU] bump up xpu-kernel v0.1.5, transpose moe weights (#38342)
Signed-off-by: mayuyuace <qiming1.zhang@intel.com>
Signed-off-by: Qiming Zhang <qiming1.zhang@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-03 14:10:02 +00:00
Necofish
580090db6b [Kernel] Add swapAB support for SM120 CUTLASS blockwise FP8 GEMM (#38325) 2026-04-03 15:49:59 +02:00
Artem Perevedentsev
cb10b7e80b [GDN] Eliminate GPU->CPU sync in prepare_chunk_indices during prefill (#38361)
Signed-off-by: Artem Perevedentsev <aperevedents@nvidia.com>
Signed-off-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
2026-04-03 13:38:02 +00:00
Mieszko Dziadowiec
bf8b022e60 [Intel][Triton] Support round_int8 for Intel backend (#38825)
Signed-off-by: Mieszko Dziadowiec <mdziadowiec@habana.ai>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Stefano Castagnetta <scastagnetta@nvidia.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2026-04-03 20:47:35 +08:00
xiangdong
40ee64c00e [XPU][CI] Skip test_topp_only and test_topk_and_topp cases on Intel GPU in CI (#38904)
Signed-off-by: zengxian <xiangdong.zeng@intel.com>
2026-04-03 20:44:52 +08:00
wufann
1b117cb0ac [ROCm] Fix aiter persistent mode mla with q/o nhead<16 for kimi-k2.5 tp8 (#38615)
Signed-off-by: wufann <36477220+wufann@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-03 03:54:00 -07:00
Anton Ivanov
abebd9323d [CPU] Replace OMP initialization (#36487)
Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
2026-04-03 18:42:43 +08:00
Hyeonki Hong
25f2b55319 [Frontend] feat: add streaming support for token generation endpoint (#37171)
Signed-off-by: Hyeonki Hong <hyeonki.hong@moreh.io>
2026-04-03 10:20:32 +00:00
xiangdong
cb4ff07f8b [XPU][CI] Skip test_topk_only cases on Intel GPU in CI (#38899)
Signed-off-by: zengxian <xiangdong.zeng@intel.com>
2026-04-03 09:50:41 +00:00
Gregory Shtrasberg
a7d79fa133 [ROCm][CI/Build] Fix the pytest hook to properly print out the summary (#38585)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2026-04-03 17:24:26 +08:00
Netanel Haber
fa9e68022d Fix Nano Nemotron VL regressions (#38655)
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-04-03 15:22:06 +08:00
476 changed files with 28778 additions and 10501 deletions


@@ -5,7 +5,6 @@ steps:
depends_on: []
device: amd_cpu
no_plugin: true
soft_fail: true
commands:
- >
docker build


@@ -56,9 +56,9 @@ steps:
'cd tests &&
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py &&
pytest -v -s v1/engine --ignore=v1/engine/test_output_processor.py &&
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py &&
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py -k "not test_topk_only and not test_topp_only and not test_topk_and_topp" &&
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py &&
pytest -v -s v1/structured_output &&
pytest -v -s v1/test_serial_utils.py &&
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py &&
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py'
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py'


@@ -1,6 +1,9 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
required_gpu_arch:
- gfx942
- gfx950
tasks:
- name: "mmlu_pro"
metrics:


@@ -1,6 +1,9 @@
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
required_gpu_arch:
- gfx942
- gfx950
tasks:
- name: "gsm8k"
metrics:


@@ -1,4 +1,7 @@
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
required_gpu_arch:
- gfx942
- gfx950
tasks:
- name: "mmlu_pro"
metrics:


@@ -1,5 +1,6 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml


@@ -13,6 +13,7 @@ import os
from contextlib import contextmanager
import lm_eval
import pytest
import yaml
from vllm.platforms import current_platform
@@ -89,9 +90,40 @@ def launch_lm_eval(eval_config, tp_size):
return results
def _check_rocm_gpu_arch_requirement(eval_config):
"""Skip the test if the model requires a ROCm GPU arch not present.
Model YAML configs can specify::
required_gpu_arch:
- gfx942
- gfx950
The check only applies on ROCm. On other platforms (e.g. CUDA) the
field is ignored so that shared config files work for both NVIDIA and
AMD CI pipelines.
"""
required_archs = eval_config.get("required_gpu_arch")
if not required_archs:
return
if not current_platform.is_rocm():
return
from vllm.platforms.rocm import _GCN_ARCH # noqa: E402
if not any(arch in _GCN_ARCH for arch in required_archs):
pytest.skip(
f"Model requires GPU arch {required_archs}, "
f"but detected arch is '{_GCN_ARCH}'"
)
def test_lm_eval_correctness_param(config_filename, tp_size):
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
_check_rocm_gpu_arch_requirement(eval_config)
results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)


@@ -19,7 +19,7 @@ has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12)
if [[ "$has_new_python" -eq 0 ]]; then
# use new python from docker
docker pull python:3-slim
PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"


@@ -35,23 +35,6 @@ export PYTHONPATH=".."
# Helper Functions
###############################################################################
wait_for_clean_gpus() {
local timeout=${1:-300}
local start=$SECONDS
echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
while true; do
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
return
fi
if (( SECONDS - start >= timeout )); then
echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
exit 1
fi
sleep 3
done
}
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -365,19 +348,12 @@ apply_rocm_test_overrides() {
###############################################################################
# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus
# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"


@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
exit 1
fi
echo "--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--result-dir ./test_results \
--result-filename dp_pp.json \
--save-result \
--endpoint /v1/completions
kill -s SIGTERM $server_pid; wait $server_pid || true
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
if [ "$failed_req" -ne 0 ]; then
echo "Some requests were failed!"
exit 1
fi
#echo "--- DP+TP"
#vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
#server_pid=$!
#timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
#vllm bench serve \
# --backend vllm \
# --dataset-name random \
# --model meta-llama/Llama-3.2-3B-Instruct \
# --num-prompts 20 \
# --result-dir ./test_results \
# --result-filename dp_pp.json \
# --save-result \
# --endpoint /v1/completions
#kill -s SIGTERM $server_pid; wait $server_pid || true
#failed_req=$(jq '.failed' ./test_results/dp_pp.json)
#if [ "$failed_req" -ne 0 ]; then
# echo "Some requests were failed!"
# exit 1
#fi


@@ -50,6 +50,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py --ignore=v1/worker/test_worker_memory_snapshot.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py --ignore=v1/spec_decode/test_acceptance_length.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py -k "not (test_register_kv_caches and FLASH_ATTN and True)"
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py --ignore=v1/kv_connector/unit/test_hf3fs_client.py --ignore=v1/kv_connector/unit/test_hf3fs_connector.py --ignore=v1/kv_connector/unit/test_hf3fs_metadata_server.py
pytest -v -s v1/test_serial_utils.py
'


@@ -751,6 +751,7 @@ steps:
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
agent_pool: mi250_1
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
@@ -2035,7 +2036,6 @@ steps:
timeout_in_minutes: 38
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
agent_pool: mi325_1
optional: true
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- csrc/
@@ -2165,7 +2165,15 @@ steps:
- vllm/platforms/rocm.py
- tests/quantization
commands:
- uv pip install --system torchao==0.14.1
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
@@ -2690,6 +2698,24 @@ steps:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
- label: LM Eval Small Models (MI325) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
agent_pool: mi325_1
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- vllm/model_executor/models/
- vllm/model_executor/model_loader/
- vllm/v1/attention/backends/
- vllm/v1/attention/selector.py
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small-rocm.txt
- label: LM Eval Small Models (B200-MI325) # TBD
timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
@@ -3320,7 +3346,7 @@ steps:
- vllm/_aiter_ops.py
- vllm/platforms/rocm.py
commands:
- uv pip install --system torchao==0.14.1
- uv pip install --system torchao==0.17.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Correctness
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Benchmarks CLI Test
timeout_in_minutes: 20
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/benchmarks/


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Platform Tests (CUDA)
timeout_in_minutes: 15
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/cuda


@@ -224,20 +224,6 @@ steps:
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: MessageQueue TCP Multi-Node (2 GPUs)
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
num_devices: 1
num_nodes: 2
no_plugin: true
optional: true
source_file_dependencies:
- vllm/distributed/device_communicators/shm_broadcast.py
- vllm/distributed/parallel_state.py
- tests/distributed/test_mq_tcp_multinode.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 1 $IMAGE_TAG "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py" "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Engine
timeout_in_minutes: 15
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/engine
@@ -25,6 +26,7 @@ steps:
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/


@@ -61,6 +61,7 @@ steps:
- label: Entrypoints Integration (API Server openai - Part 3)
timeout_in_minutes: 50
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
@@ -105,6 +106,7 @@ steps:
- label: OpenAI API Correctness
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: EPLB Algorithm
timeout_in_minutes: 15
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: vLLM IR Tests
timeout_in_minutes: 10
device: h200_18gb
working_dir: "/vllm-workspace/"
source_file_dependencies:
- vllm/ir
@@ -17,10 +18,9 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py
commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- pytest -v -s kernels/core kernels/test_concat_mla_q.py
- label: Kernels Attention Test %N
timeout_in_minutes: 35
@@ -106,6 +106,7 @@ steps:
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- tests/kernels/test_top_k_per_row.py
commands:
- nvidia-smi
- python3 examples/basic/offline_inference/chat.py
@@ -116,6 +117,7 @@ steps:
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
- pytest -v -s tests/kernels/test_top_k_per_row.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
@@ -179,3 +181,21 @@ steps:
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
- label: Kernels FusedMoE Layer Test (2 H100s)
timeout_in_minutes: 90
device: h100
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py
- label: Kernels FusedMoE Layer Test (2 B200s)
timeout_in_minutes: 90
device: b200
num_devices: 2
optional: true
commands:
- pytest -v -s kernels/moe/test_moe_layer.py


@@ -19,6 +19,7 @@ steps:
- label: V1 Sample + Logits
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/v1/sample
@@ -86,6 +87,7 @@ steps:
- label: Regression
timeout_in_minutes: 20
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/test_regression
@@ -174,6 +176,7 @@ steps:
- tests/renderers
- tests/standalone_tests/lazy_imports.py
- tests/tokenizers_
- tests/reasoning
- tests/tool_parsers
- tests/transformers_utils
- tests/config
@@ -187,6 +190,7 @@ steps:
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s renderers
- pytest -v -s tokenizers_
- pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py
- pytest -v -s tool_parsers
- pytest -v -s transformers_utils
- pytest -v -s config


@@ -78,7 +78,6 @@ steps:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
# These require fix https://github.com/vllm-project/vllm/pull/36280
- label: Model Runner V2 Pipeline Parallelism (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
@@ -101,11 +100,13 @@ steps:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
- tests/v1/spec_decode/test_synthetic_rejection_sampler_utils.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
- pytest -v -s v1/spec_decode/test_synthetic_rejection_sampler_utils.py
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
device: h200_18gb
torch_nightly: true
source_file_dependencies:
- vllm/


@@ -38,7 +38,7 @@ steps:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
@@ -53,7 +53,7 @@ steps:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
mirror:
amd:
@@ -67,6 +67,7 @@ steps:
- label: Language Models Test (PPL)
timeout_in_minutes: 110
device: h200_18gb
optional: true
source_file_dependencies:
- vllm/
@@ -90,6 +91,7 @@ steps:
- label: Language Models Test (MTEB)
timeout_in_minutes: 110
device: h200_18gb
optional: true
source_file_dependencies:
- vllm/


@@ -4,6 +4,7 @@ depends_on:
steps:
- label: "Multi-Modal Models (Standard) 1: qwen2"
timeout_in_minutes: 45
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/models/multimodal
@@ -19,6 +20,7 @@ steps:
- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma"
timeout_in_minutes: 45
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/models/multimodal
@@ -54,7 +56,8 @@ steps:
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing
- pytest models/multimodal/generation/test_memory_leak.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
mirror:
amd:
@@ -77,6 +80,7 @@ steps:
- label: Multi-Modal Processor # 44min
timeout_in_minutes: 60
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/models/multimodal
@@ -131,6 +135,7 @@ steps:
- label: Multi-Modal Models (Extended Pooling)
optional: true
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/models/multimodal/pooling


@@ -49,6 +49,7 @@ steps:
- label: PyTorch Fullgraph
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/
- tests/compile
@@ -60,6 +61,7 @@ steps:
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
device: h200_18gb
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt


@@ -16,7 +16,7 @@ steps:
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system torchao==0.17.0 --index-url https://download.pytorch.org/whl/cu130
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py


@@ -7,6 +7,7 @@ steps:
# If this fails, it means the PR introduces a dependency that
# conflicts with Ray's dependency constraints.
# See https://github.com/vllm-project/vllm/issues/33599
device: h200_18gb
soft_fail: true
timeout_in_minutes: 10
source_file_dependencies:


@@ -4,6 +4,18 @@ depends_on:
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Eagle Nightly B200
timeout_in_minutes: 30
device: b200
optional: true
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
@@ -13,6 +25,19 @@ steps:
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Speculators + MTP Nightly B200
timeout_in_minutes: 30
device: b200
optional: true
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
@@ -23,6 +48,7 @@ steps:
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
@@ -32,6 +58,18 @@ steps:
- label: Spec Decode Draft Model
timeout_in_minutes: 30
device: h200_18gb
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
- label: Spec Decode Draft Model Nightly B200
timeout_in_minutes: 30
device: b200
optional: true
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/

.github/mergify.yml (11 changes)

@@ -18,7 +18,7 @@ pull_request_rules:
- name: comment-pre-commit-failure
description: Comment on PR when pre-commit check fails
conditions:
- status-failure=pre-commit
- check-failure=pre-commit
- -closed
- -draft
actions:
@@ -51,7 +51,7 @@ pull_request_rules:
- name: comment-dco-failure
description: Comment on PR when DCO check fails
conditions:
- status-failure=dco
- check-failure=dco
- -closed
- -draft
actions:
@@ -378,17 +378,18 @@ pull_request_rules:
add:
- tool-calling
- name: auto-rebase if approved, ready, and 40 commits behind main
- name: auto-rebase to keep merge candidate within 1 day behind main
conditions:
- base = main
- label=ready
- "#approved-reviews-by >= 1"
- "#commits-behind >= 40"
- "#commits-behind >= 50"
- "#check-failure = 0"
- -closed
- -draft
- -conflict
actions:
rebase: {}
update: {}
- name: ping author on conflicts and add 'needs-rebase' label
conditions:

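The reworked auto-rebase rule above reads as a pure predicate; a hedged Python sketch of the same conditions (the dict keys mirror the mergify condition names but are hypothetical, not a real mergify API):

```python
# Sketch of the updated mergify auto-rebase rule: only update an approved,
# ready PR that is >= 50 commits behind main and has no failing checks.
def should_auto_rebase(pr: dict) -> bool:
    return (
        pr["base"] == "main"
        and "ready" in pr["labels"]
        and pr["approved_reviews"] >= 1
        and pr["commits_behind"] >= 50      # raised from 40
        and pr["check_failures"] == 0       # new: skip PRs with CI failures
        and not pr["closed"]
        and not pr["draft"]
        and not pr["conflict"]
    )

pr = {"base": "main", "labels": ["ready"], "approved_reviews": 1,
      "commits_behind": 60, "check_failures": 0,
      "closed": False, "draft": False, "conflict": False}
print(should_auto_rebase(pr))  # True
```

The `check_failures == 0` term encodes the intent of the "Don't auto-rebase PRs with CI failures" change: updating a branch restarts CI, so rebasing a red PR would only waste runners.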
.gitignore (3 changes)

@@ -12,6 +12,9 @@ vllm/third_party/triton_kernels/*
# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py
# DeepGEMM vendored package built from source
vllm/third_party/deep_gemm/
# triton jit
.triton


@@ -39,7 +39,7 @@ repos:
rev: 0.11.1
hooks:
- id: pip-compile
args: [requirements/test.in, -c, requirements/common.txt, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
args: [requirements/test.in, -c, requirements/common.txt, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu130, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
files: ^requirements/test\.(in|txt)$
- id: pip-compile
alias: pip-compile-rocm
@@ -59,21 +59,54 @@ repos:
--no-emit-package, cuda-pathfinder,
--no-emit-package, cuda-toolkit,
--no-emit-package, cupy-cuda12x,
# nvidia packages (unsuffixed / unified naming)
--no-emit-package, nvidia-cublas,
--no-emit-package, nvidia-cuda-cupti,
--no-emit-package, nvidia-cuda-nvrtc,
--no-emit-package, nvidia-cuda-runtime,
--no-emit-package, nvidia-cudnn-cu13,
--no-emit-package, nvidia-cudnn,
--no-emit-package, nvidia-cufft,
--no-emit-package, nvidia-cufile,
--no-emit-package, nvidia-curand,
--no-emit-package, nvidia-cusolver,
--no-emit-package, nvidia-cusparse,
--no-emit-package, nvidia-cusparselt,
--no-emit-package, nvidia-nccl,
--no-emit-package, nvidia-nvjitlink,
--no-emit-package, nvidia-nvshmem,
--no-emit-package, nvidia-nvtx,
# nvidia cu12 packages
--no-emit-package, nvidia-cublas-cu12,
--no-emit-package, nvidia-cuda-cupti-cu12,
--no-emit-package, nvidia-cuda-nvrtc-cu12,
--no-emit-package, nvidia-cuda-runtime-cu12,
--no-emit-package, nvidia-cudnn-cu12,
--no-emit-package, nvidia-cufft-cu12,
--no-emit-package, nvidia-cufile-cu12,
--no-emit-package, nvidia-curand-cu12,
--no-emit-package, nvidia-cusolver-cu12,
--no-emit-package, nvidia-cusparse-cu12,
--no-emit-package, nvidia-cusparselt-cu12,
--no-emit-package, nvidia-nccl-cu12,
--no-emit-package, nvidia-nvjitlink-cu12,
--no-emit-package, nvidia-nvshmem-cu12,
--no-emit-package, nvidia-nvtx-cu12,
# nvidia cu13 packages
--no-emit-package, nvidia-cublas-cu13,
--no-emit-package, nvidia-cuda-cupti-cu13,
--no-emit-package, nvidia-cuda-nvrtc-cu13,
--no-emit-package, nvidia-cuda-runtime-cu13,
--no-emit-package, nvidia-cudnn-cu13,
--no-emit-package, nvidia-cufft-cu13,
--no-emit-package, nvidia-cufile-cu13,
--no-emit-package, nvidia-curand-cu13,
--no-emit-package, nvidia-cusolver-cu13,
--no-emit-package, nvidia-cusparse-cu13,
--no-emit-package, nvidia-cusparselt-cu13,
--no-emit-package, nvidia-nccl-cu13,
--no-emit-package, nvidia-nvjitlink,
--no-emit-package, nvidia-nvjitlink-cu13,
--no-emit-package, nvidia-nvshmem-cu13,
--no-emit-package, nvidia-nvtx,
--no-emit-package, nvidia-nvtx-cu13,
]
files: ^requirements/rocm-test\.(in|txt)$
- repo: local


@@ -56,8 +56,8 @@ endif()
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.11.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.11.0")
#
# Try to find python package with an executable that exactly matches
@@ -225,8 +225,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
#
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result -Wno-unused-value")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result -Wno-unused-value")
endif()
#
@@ -299,6 +299,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/w8a8/int8/scaled_quant.cu"
"csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/fused_kernels/fused_silu_mul_block_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
@@ -340,8 +341,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/quantization/fused_kernels/fused_silu_mul_block_quant.cu")
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -1222,6 +1222,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/deepgemm.cmake)
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/qutlass.cmake)


@@ -23,47 +23,54 @@ For events, please visit [vllm.ai/events](https://vllm.ai/events) to join us.
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has grown into one of the most active open-source AI projects, built and maintained by a diverse community spanning dozens of academic institutions and companies, with over 2,000 contributors.
vLLM is fast with:
- State-of-the-art serving throughput
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
- Speculative decoding
- Chunked prefill
- Continuous batching of incoming requests, chunked prefill, prefix caching
- Fast and flexible model execution with piecewise and full CUDA/HIP graphs
- Quantization: FP8, MXFP8/MXFP4, NVFP4, INT8, INT4, GPTQ/AWQ, GGUF, compressed-tensors, ModelOpt, TorchAO, and [more](https://docs.vllm.ai/en/latest/features/quantization/index.html)
- Optimized attention kernels including FlashAttention, FlashInfer, TRTLLM-GEN, FlashMLA, and Triton
- Optimized GEMM/MoE kernels for various precisions using CUTLASS, TRTLLM-GEN, CuTeDSL
- Speculative decoding including n-gram, suffix, EAGLE, DFlash
- Automatic kernel generation and graph-level transformations using torch.compile
- Disaggregated prefill, decode, and encode
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Tensor, pipeline, data, expert, and context parallelism for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support
- Multi-LoRA support
- Generation of structured outputs using xgrammar or guidance
- Tool calling and reasoning parsers
- OpenAI-compatible API server, plus Anthropic Messages API and gRPC support
- Efficient multi-LoRA support for dense and MoE layers
- Support for NVIDIA GPUs, AMD GPUs, and x86/ARM/PowerPC CPUs. Additionally, diverse hardware plugins such as Google TPUs, Intel Gaudi, IBM Spyre, Huawei Ascend, Rebellions NPU, Apple Silicon, MetaX GPU, and more.
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
vLLM seamlessly supports 200+ model architectures on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
- Decoder-only LLMs (e.g., Llama, Qwen, Gemma)
- Mixture-of-Expert LLMs (e.g., Mixtral, DeepSeek-V3, Qwen-MoE, GPT-OSS)
- Hybrid attention and state-space models (e.g., Mamba, Qwen3.5)
- Multi-modal models (e.g., LLaVA, Qwen-VL, Pixtral)
- Embedding and retrieval models (e.g., E5-Mistral, GTE, ColBERT)
- Reward and classification models (e.g., Qwen-Math)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Getting Started
Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
Install vLLM with [`uv`](https://docs.astral.sh/uv/) (recommended) or `pip`:
```bash
pip install vllm
uv pip install vllm
```
Or [build from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source) for development.
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)

View File

@@ -9,11 +9,12 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0"
import torch
from vllm.benchmarks.lib.utils import default_vllm_config
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp,
from vllm.model_executor.kernels.linear import (
init_fp8_linear_kernel,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
create_fp8_quant_key,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED,
@@ -70,11 +71,15 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
weight_group_shape = GroupShape(block_n, block_k)
act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization
linear_op = W8A8BlockFp8LinearOp(
weight_group_shape=weight_group_shape,
act_quant_group_shape=act_quant_group_shape,
cutlass_block_fp8_supported=use_cutlass,
use_aiter_and_is_supported=False,
linear_op = init_fp8_linear_kernel(
weight_quant_key=create_fp8_quant_key(
static=True, group_shape=weight_group_shape
),
activation_quant_key=create_fp8_quant_key(
static=False, group_shape=act_quant_group_shape
),
out_dtype=torch.get_default_dtype(),
module_name="build_w8a8_block_fp8_runner",
)
def run():

View File

@@ -20,7 +20,7 @@ import matplotlib.pyplot as plt
import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.triton_utils import tl, triton

View File

@@ -0,0 +1,151 @@
include(FetchContent)
# If DEEPGEMM_SRC_DIR is set, DeepGEMM is built from that directory
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{DEEPGEMM_SRC_DIR})
set(DEEPGEMM_SRC_DIR $ENV{DEEPGEMM_SRC_DIR})
endif()
if(DEEPGEMM_SRC_DIR)
FetchContent_Declare(
deepgemm
SOURCE_DIR ${DEEPGEMM_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
# This ref should be kept in sync with tools/install_deepgemm.sh
FetchContent_Declare(
deepgemm
GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM.git
GIT_TAG 477618cd51baffca09c4b0b87e97c03fe827ef03
GIT_SUBMODULES "third-party/cutlass" "third-party/fmt"
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
# Use FetchContent_Populate (not MakeAvailable) to avoid processing
# DeepGEMM's own CMakeLists.txt which has incompatible find_package calls.
FetchContent_GetProperties(deepgemm)
if(NOT deepgemm_POPULATED)
FetchContent_Populate(deepgemm)
endif()
message(STATUS "DeepGEMM is available at ${deepgemm_SOURCE_DIR}")
# DeepGEMM requires CUDA 12.3+ for SM90, 12.9+ for SM100
set(DEEPGEMM_SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
list(APPEND DEEPGEMM_SUPPORT_ARCHS "9.0a")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0f")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
list(APPEND DEEPGEMM_SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(DEEPGEMM_ARCHS
"${DEEPGEMM_SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(DEEPGEMM_ARCHS)
message(STATUS "DeepGEMM CUDA architectures: ${DEEPGEMM_ARCHS}")
find_package(CUDAToolkit REQUIRED)
#
# Build the _C pybind11 extension from DeepGEMM's C++ source.
# This is a CXX-only module — CUDA kernels are JIT-compiled at runtime.
#
Python_add_library(_deep_gemm_C MODULE WITH_SOABI
"${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
# The pybind11 module name must be _C to match DeepGEMM's Python imports.
set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
target_compile_definitions(_deep_gemm_C PRIVATE
"-DTORCH_EXTENSION_NAME=_C")
target_include_directories(_deep_gemm_C PRIVATE
"${deepgemm_SOURCE_DIR}/csrc"
"${deepgemm_SOURCE_DIR}/deep_gemm/include"
"${deepgemm_SOURCE_DIR}/third-party/cutlass/include"
"${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
"${deepgemm_SOURCE_DIR}/third-party/fmt/include")
target_compile_options(_deep_gemm_C PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
$<$<COMPILE_LANGUAGE:CXX>:-O3>
$<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
$<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
# torch_python is required because DeepGEMM uses pybind11 type casters
# for at::Tensor (via PYBIND11_MODULE), unlike vLLM's own extensions which
# use torch::Library custom ops.
find_library(TORCH_PYTHON_LIBRARY torch_python
PATHS "${TORCH_INSTALL_PREFIX}/lib"
REQUIRED)
target_link_libraries(_deep_gemm_C PRIVATE
torch ${TORCH_LIBRARIES} "${TORCH_PYTHON_LIBRARY}"
CUDA::cudart CUDA::nvrtc)
# Install the shared library into the vendored package directory
install(TARGETS _deep_gemm_C
LIBRARY DESTINATION vllm/third_party/deep_gemm
COMPONENT _deep_gemm_C)
#
# Vendor DeepGEMM Python package files
#
install(FILES
"${deepgemm_SOURCE_DIR}/deep_gemm/__init__.py"
DESTINATION vllm/third_party/deep_gemm
COMPONENT _deep_gemm_C)
install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/utils/"
DESTINATION vllm/third_party/deep_gemm/utils
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")
install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/testing/"
DESTINATION vllm/third_party/deep_gemm/testing
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")
install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/legacy/"
DESTINATION vllm/third_party/deep_gemm/legacy
COMPONENT _deep_gemm_C
FILES_MATCHING PATTERN "*.py")
# Generate envs.py (normally generated by DeepGEMM's setup.py build step)
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
"# Pre-installed environment variables\npersistent_envs = dict()\n")
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm_envs.py"
DESTINATION vllm/third_party/deep_gemm
RENAME envs.py
COMPONENT _deep_gemm_C)
#
# Install include files needed for JIT compilation at runtime.
# The JIT compiler finds these relative to the package directory.
#
# DeepGEMM's own CUDA headers
install(DIRECTORY "${deepgemm_SOURCE_DIR}/deep_gemm/include/"
DESTINATION vllm/third_party/deep_gemm/include
COMPONENT _deep_gemm_C)
# CUTLASS and CuTe headers (vendored for JIT, separate from vLLM's CUTLASS)
install(DIRECTORY "${deepgemm_SOURCE_DIR}/third-party/cutlass/include/"
DESTINATION vllm/third_party/deep_gemm/include
COMPONENT _deep_gemm_C)
else()
message(STATUS "DeepGEMM will not compile: "
"unsupported CUDA architecture ${CUDA_ARCHS}")
# Create empty target so setup.py doesn't fail on unsupported systems
add_custom_target(_deep_gemm_C)
endif()

View File

@@ -39,7 +39,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG c0ec424fd8a546d0cbbf4bf050bbcfe837c55afb
GIT_TAG f5bc33cfc02c744d24a2e9d50e6db656de40611c
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -87,10 +87,21 @@ endforeach()
#
add_custom_target(_vllm_fa4_cutedsl_C)
# Copy flash_attn/cute directory (needed for FA4) and transform imports
# The cute directory uses flash_attn.cute imports internally, which we replace
# with vllm.vllm_flash_attn.cute to match our package structure.
install(CODE "
# Install flash_attn/cute directory (needed for FA4).
# When using a local source dir (VLLM_FLASH_ATTN_SRC_DIR), create a symlink
# so edits to cute-dsl Python files take effect immediately without rebuilding.
# Otherwise, copy files and transform flash_attn.cute imports to
# vllm.vllm_flash_attn.cute to match our package structure.
if(VLLM_FLASH_ATTN_SRC_DIR)
install(CODE "
set(LINK_TARGET \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\")
set(LINK_NAME \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute\")
file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")
file(REMOVE_RECURSE \"\${LINK_NAME}\")
file(CREATE_LINK \"\${LINK_TARGET}\" \"\${LINK_NAME}\" SYMBOLIC)
" COMPONENT _vllm_fa4_cutedsl_C)
else()
install(CODE "
file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
foreach(SRC_FILE \${CUTE_PY_FILES})
file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
@@ -101,4 +112,5 @@ install(CODE "
string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
endforeach()
" COMPONENT _vllm_fa4_cutedsl_C)
" COMPONENT _vllm_fa4_cutedsl_C)
endif()

View File

@@ -91,9 +91,9 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
if (n == 0) return;
const int64_t* src_data = src_ptrs.data_ptr<int64_t>();
const int64_t* dst_data = dst_ptrs.data_ptr<int64_t>();
const int64_t* size_data = sizes.data_ptr<int64_t>();
int64_t* src_data = src_ptrs.mutable_data_ptr<int64_t>();
int64_t* dst_data = dst_ptrs.mutable_data_ptr<int64_t>();
int64_t* size_data = sizes.mutable_data_ptr<int64_t>();
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -107,15 +107,24 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
CUmemcpyAttributes attr = {};
attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
size_t attrs_idx = 0;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
CUresult result = cuMemcpyBatchAsync(
reinterpret_cast<CUdeviceptr*>(dst_data),
reinterpret_cast<CUdeviceptr*>(src_data),
reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
&attrs_idx, 1, static_cast<CUstream>(stream));
TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed with error ",
result);
#else
size_t fail_idx = 0;
CUresult result = cuMemcpyBatchAsync(
reinterpret_cast<CUdeviceptr*>(const_cast<int64_t*>(dst_data)),
reinterpret_cast<CUdeviceptr*>(const_cast<int64_t*>(src_data)),
reinterpret_cast<size_t*>(const_cast<int64_t*>(size_data)),
static_cast<size_t>(n), &attr, &attrs_idx, 1, &fail_idx,
static_cast<CUstream>(stream));
reinterpret_cast<CUdeviceptr*>(dst_data),
reinterpret_cast<CUdeviceptr*>(src_data),
reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
&attrs_idx, 1, &fail_idx, static_cast<CUstream>(stream));
TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
fail_idx, " with error ", result);
#endif
#else
// Fallback for CUDA < 12.8 and ROCm: individual async copies.
// cudaMemcpyDefault lets the driver infer direction from pointer types.

View File

@@ -53,7 +53,7 @@ class TileGemm82 {
const int64_t ldb, const int64_t ldc,
const int32_t block_size, const int32_t dynamic_k_size,
const bool accum_c) {
static_assert(0 < M <= 8);
static_assert(0 < M && M <= 8);
using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
kv_cache_t* __restrict__ curr_b_0 = b_tile;

View File

@@ -68,7 +68,7 @@ class TileGemm161 {
const int64_t ldb, const int64_t ldc,
const int32_t block_size, const int32_t dynamic_k_size,
const bool accum_c) {
static_assert(0 < M <= 16);
static_assert(0 < M && M <= 16);
using load_vec_t = typename VecTypeTrait<kv_cache_t>::vec_t;
kv_cache_t* __restrict__ curr_b_0 = b_tile;

View File

@@ -39,7 +39,7 @@ class TileGemm82 {
template <int32_t M>
static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) {
static_assert(0 < M <= 8);
static_assert(0 < M && M <= 8);
using load_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
scalar_t* __restrict__ curr_b_0 = b_ptr;

View File

@@ -8,8 +8,6 @@
// libraries use different ISAs.
#define TORCH_EXTENSION_NAME _C
std::string init_cpu_threads_env(const std::string& cpu_ids);
void release_dnnl_matmul_handler(int64_t handler);
int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
@@ -354,7 +352,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"str act, str isa) -> ()");
ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe);
#endif
ops.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
ops.def(
"mla_decode_kvcache("
" Tensor! out, Tensor query, Tensor kv_cache,"

View File

@@ -21,150 +21,6 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#endif
#ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask != nullptr,
"Failed to parse CPU string: " + cpu_ids);
TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size);
constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp);
for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) {
unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size];
int i = 0;
while (group_mask) {
if (group_mask & 1) {
omp_cpu_ids.emplace_back(offset + i);
}
++i;
group_mask >>= 1;
}
}
// Memory node binding
if (numa_available() != -1) {
std::set<int> node_ids;
for (const auto& cpu_id : omp_cpu_ids) {
int node_id = numa_node_of_cpu(cpu_id);
if (node_id != -1) {
node_ids.insert(node_id);
}
}
// Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) {
std::string node_ids_str;
for (const int node_id : node_ids) {
if (!node_ids_str.empty()) {
node_ids_str += ",";
}
node_ids_str += std::to_string(node_id);
}
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
bitmask* src_mask = numa_get_mems_allowed();
int pid = getpid();
if (mask && src_mask) {
// move all existing pages to the specified numa node.
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_WARN("numa_migrate_pages failed. errno: " +
std::to_string(errno));
}
// Restrict memory allocation to the selected NUMA node(s).
// Enhances memory locality for the threads bound to those NUMA CPUs.
if (node_ids.size() > 1) {
errno = 0;
numa_set_interleave_mask(mask);
if (errno != 0) {
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: " +
node_ids_str +
"). Memory allocations will be "
"interleaved across the specified NUMA nodes.");
}
} else {
errno = 0;
numa_set_membind(mask);
if (errno != 0) {
TORCH_WARN("numa_set_membind failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes (" +
node_ids_str +
"). Memory allocations will be "
"strictly bound to these NUMA nodes.");
}
}
numa_set_strict(1);
numa_free_nodemask(mask);
numa_free_nodemask(src_mask);
} else {
TORCH_WARN(
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
std::to_string(errno));
}
}
}
// OMP threads binding
omp_set_num_threads((int)omp_cpu_ids.size());
torch::set_num_threads((int)omp_cpu_ids.size());
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
std::vector<std::pair<int, int>> thread_core_mapping;
thread_core_mapping.reserve(omp_cpu_ids.size());
omp_lock_t writelock;
omp_init_lock(&writelock);
#pragma omp parallel for schedule(static, 1)
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(omp_cpu_ids[i], &mask);
int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
if (ret == -1) {
TORCH_CHECK(false,
"sched_setaffinity failed. errno: " + std::to_string(errno));
}
omp_set_lock(&writelock);
thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
omp_unset_lock(&writelock);
}
omp_destroy_lock(&writelock);
numa_free_nodemask(omp_cpu_mask);
std::stringstream ss;
ss << "OMP threads binding of Process " << getpid() << ":\n";
std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
[](auto&& a, auto&& b) { return a.second < b.second; });
for (auto&& item : thread_core_mapping) {
ss << "\t"
<< "OMP tid: " << item.first << ", core " << item.second << "\n";
}
return ss.str();
}
#endif // VLLM_NUMA_DISABLED
namespace cpu_utils {
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
this->realloc(allocation_unit * 128);

View File

@@ -55,7 +55,8 @@ struct Counter {
inline int64_t get_available_l2_size() {
static int64_t size = []() {
const uint32_t l2_cache_size = at::cpu::L2_cache_size();
auto caps = at::cpu::get_cpu_capabilities();
const uint32_t l2_cache_size = caps.at("l2_cache_size").toInt();
return l2_cache_size >> 1; // use 50% of L2 cache
}();
return size;

View File

@@ -26,8 +26,10 @@ using namespace cute;
template <class OutType, int ScaleGranularityM,
int ScaleGranularityN, int ScaleGranularityK,
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler>
class EpilogueScheduler, class MainloopScheduler,
bool swap_ab_ = false>
struct cutlass_3x_gemm_fp8_blockwise {
static constexpr bool swap_ab = swap_ab_;
using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB;
@@ -55,9 +57,13 @@ struct cutlass_3x_gemm_fp8_blockwise {
using ElementCompute = float;
using ElementBlockScale = float;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<
using ScaleConfig = conditional_t<swap_ab,
cutlass::detail::Sm120BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
cute::UMMA::Major::K, cute::UMMA::Major::MN>,
cutlass::detail::Sm120BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
@@ -78,17 +84,32 @@ struct cutlass_3x_gemm_fp8_blockwise {
ElementAccumulator,
ElementCompute,
ElementC,
LayoutC,
conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
AlignmentC,
ElementD,
LayoutD,
conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
AlignmentD,
EpilogueScheduler,
DefaultOperation
>::CollectiveOp;
using StageCountType = cutlass::gemm::collective::StageCountAuto;
using CollectiveMainloop =
using CollectiveMainloop = conditional_t<swap_ab,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementB,
cute::tuple<LayoutB_Transpose, LayoutSFA>,
AlignmentB,
ElementA,
cute::tuple<LayoutA_Transpose, LayoutSFB>,
AlignmentA,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
@@ -103,7 +124,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp;
>::CollectiveOp>;
// SM12x family to support both SM120 (RTX 5090) and SM121 (DGX Spark)
using KernelType = enable_sm120_family<cutlass::gemm::kernel::GemmUniversal<
@@ -115,7 +136,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
// Tile configurations for different M ranges
template <typename OutType>
struct sm120_blockwise_fp8_config_default {
// M > 256: use 128x128x128 tile with Cooperative (Auto) schedule
// use 128x128x128 tile with Cooperative (Auto) schedule
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _128>;
@@ -127,8 +148,8 @@ struct sm120_blockwise_fp8_config_default {
};
template <typename OutType>
struct sm120_blockwise_fp8_config_M64 {
// M in [1, 256]: use 64x128x128 tile with Pingpong schedule
struct sm120_blockwise_fp8_config_pingpong {
// use 64x128x128 tile with Pingpong schedule
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_64, _128, _128>;
@@ -139,11 +160,24 @@ struct sm120_blockwise_fp8_config_M64 {
EpilogueSchedule, KernelSchedule>;
};
template <typename OutType>
struct sm120_blockwise_fp8_config_swapab {
// use 128x32x128 tile with Cooperative schedule
using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedBlockwiseCooperativeSm120;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _32, _128>;
using ClusterShape = Shape<_1, _1, _1>;
using Gemm = cutlass_3x_gemm_fp8_blockwise<
OutType, 128, 1, 128, TileShape, ClusterShape,
EpilogueSchedule, KernelSchedule, true>;
};
template <typename Gemm>
void cutlass_gemm_caller_blockwise(torch::stable::Tensor& out, torch::stable::Tensor const& a,
torch::stable::Tensor const& b,
torch::stable::Tensor const& a_scales,
torch::stable::Tensor const& b_scales) {
static constexpr bool swap_ab = Gemm::swap_ab;
using GemmKernel = typename Gemm::GemmKernel;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
@@ -167,11 +201,13 @@ void cutlass_gemm_caller_blockwise(torch::stable::Tensor& out, torch::stable::Te
b_stride =
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
c_stride =
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
LayoutSFA layout_SFA =
LayoutSFA layout_SFA = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
LayoutSFB layout_SFB =
LayoutSFB layout_SFB = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
@@ -180,15 +216,24 @@ void cutlass_gemm_caller_blockwise(torch::stable::Tensor& out, torch::stable::Te
auto b_scales_ptr = static_cast<ElementBlockScale const*>(b_scales.data_ptr());
typename GemmKernel::MainloopArguments mainloop_args{};
mainloop_args.layout_SFA = layout_SFA;
mainloop_args.layout_SFB = layout_SFB;
if (swap_ab) {
mainloop_args.ptr_A = b_ptr;
mainloop_args.dA = b_stride;
mainloop_args.ptr_B = a_ptr;
mainloop_args.dB = a_stride;
mainloop_args.ptr_SFA = b_scales_ptr;
mainloop_args.ptr_SFB = a_scales_ptr;
} else {
mainloop_args.ptr_A = a_ptr;
mainloop_args.dA = a_stride;
mainloop_args.ptr_B = b_ptr;
mainloop_args.dB = b_stride;
mainloop_args.ptr_SFA = a_scales_ptr;
mainloop_args.layout_SFA = layout_SFA;
mainloop_args.ptr_SFB = b_scales_ptr;
mainloop_args.layout_SFB = layout_SFB;
auto prob_shape = cute::make_shape(m, n, k, 1);
}
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
@@ -204,8 +249,12 @@ void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::stable::Tensor& out,
torch::stable::Tensor const& a_scales,
torch::stable::Tensor const& b_scales) {
int M = a.size(0);
// more heuristic tuning can be done here by checking N/K dimensions as well
bool swap_ab = (M <= 64) || (M % 4 != 0);
if (!swap_ab) {
if (M <= 256) {
using Gemm = typename sm120_blockwise_fp8_config_M64<OutType>::Gemm;
using Gemm = typename sm120_blockwise_fp8_config_pingpong<OutType>::Gemm;
return cutlass_gemm_caller_blockwise<Gemm>(
out, a, b, a_scales, b_scales);
}
@@ -213,6 +262,13 @@ void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::stable::Tensor& out,
using Gemm = typename sm120_blockwise_fp8_config_default<OutType>::Gemm;
return cutlass_gemm_caller_blockwise<Gemm>(
out, a, b, a_scales, b_scales);
} else {
// Swap A/B for small M to improve performance
// Use TILE_N=32 as the minimum compatible tile size.
using Gemm = typename sm120_blockwise_fp8_config_swapab<OutType>::Gemm;
return cutlass_gemm_caller_blockwise<Gemm>(
out, a, b, a_scales, b_scales);
}
}
} // namespace vllm

View File

@@ -114,9 +114,9 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
int64_t numRows, int64_t stride0, int64_t stride1,
int64_t topK);
void large_context_topk(const torch::Tensor& score, torch::Tensor& indices,
const torch::Tensor& lengths,
std::optional<torch::Tensor> row_starts_opt);
void persistent_topk(const torch::Tensor& logits, const torch::Tensor& lengths,
torch::Tensor& output, torch::Tensor& workspace, int64_t k,
int64_t max_seq_len);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
@@ -143,13 +143,11 @@ void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input,
std::optional<torch::Tensor> residual,
int64_t group_size, bool is_scale_transposed);
#ifndef USE_ROCM
void silu_and_mul_per_block_quant(torch::Tensor& out,
torch::Tensor const& input,
torch::Tensor& scales, int64_t group_size,
std::optional<torch::Tensor> scale_ub,
bool is_scale_transposed);
#endif
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
std::optional<torch::Tensor> key, int64_t head_size,

csrc/persistent_topk.cuh (new file, 1321 lines)

File diff suppressed because it is too large

View File

@@ -6,7 +6,7 @@
#include "libtorch_stable/quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/w8a8/fp8/common.cuh"
#include "../w8a8/fp8/common.cuh"
namespace vllm {

View File

@@ -1,7 +1,7 @@
#pragma once
#include "libtorch_stable/quantization/vectorization.cuh"
#include "quantization/utils.cuh"
#include "../../utils.cuh"
#include <cmath>

View File

@@ -564,8 +564,9 @@ template <int kNumThreadsPerBlock, bool useRadixSort,
bool multipleBlocksPerRow = false, bool mergeBlocks = false>
static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
const float* logits, const int* seqLens, int* outIndices, int stride0,
int stride1, const int topK, int next_n, float* outLogits = nullptr,
const int numBlocksToMerge = 0, const int* indices = nullptr) {
int stride1, const int topK, int next_n, int seqLensIs2D = 0,
float* outLogits = nullptr, const int numBlocksToMerge = 0,
const int* indices = nullptr) {
// The number of bins in the histogram.
static constexpr int kNumBins = 2048;
@@ -574,8 +575,16 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
// The range of logits within the row.
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = max(0, seq_len - next_n + (rowIdx % next_n) + 1);
int batch_idx = rowIdx / next_n;
int next_n_idx = rowIdx % next_n;
// seqLensIs2D=0: 1D seqLens — all rows in a batch share the same seq_len;
// kernel computes per-row effective length via offset.
// seqLensIs2D=1: 2D seqLens — each logit row has its own pre-computed
// effective length (flat index rowIdx = b*next_n + j maps
// directly to seqLens[b, j] in C-contiguous layout).
int seq_len = seqLensIs2D ? seqLens[rowIdx] : seqLens[batch_idx];
int rowEnd =
seqLensIs2D ? max(0, seq_len) : max(0, seq_len - next_n + next_n_idx + 1);
// Local pointers to this block
if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
@@ -653,6 +662,11 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const auto numColumns = logits.size(1);
// True if seqLens is 2D (B, next_n): each logit row has its own pre-computed
// effective seq_len. False if seqLens is 1D (B,): all rows in a batch share
// the same seq_len and the kernel computes the per-row offset itself.
int seqLensIs2D = seqLens.dim() == 2 ? 1 : 0;
if (numColumns < kSortingAlgorithmThreshold) {
// Use insertion sort
vllm::topKPerRowDecode<kNumThreadsPerBlock, false>
@@ -660,7 +674,7 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(topK),
static_cast<int>(next_n));
static_cast<int>(next_n), seqLensIs2D);
} else if (numColumns < kSplitWorkThreshold) {
// From this threshold, use radix sort instead
vllm::topKPerRowDecode<kNumThreadsPerBlock, true>
@@ -668,7 +682,7 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(topK),
static_cast<int>(next_n));
static_cast<int>(next_n), seqLensIs2D);
} else {
// Long sequences are run in two steps
constexpr auto multipleBlocksPerRowConfig = 10;
@@ -686,15 +700,16 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
outIndicesAux.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(topK),
static_cast<int>(next_n), outLogitsAux.data_ptr<float>());
static_cast<int>(next_n), seqLensIs2D,
outLogitsAux.data_ptr<float>());
constexpr int kNumThreadsPerBlockMerge = 1024;
vllm::topKPerRowDecode<kNumThreadsPerBlockMerge, true, false, true>
<<<numRows, kNumThreadsPerBlockMerge, topK * sizeof(int32_t), stream>>>(
outLogitsAux.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), multipleBlocksPerRowConfig * topK, 1,
static_cast<int>(topK), static_cast<int>(next_n), nullptr,
multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
static_cast<int>(topK), static_cast<int>(next_n), seqLensIs2D,
nullptr, multipleBlocksPerRowConfig, outIndicesAux.data_ptr<int>());
}
}
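The host-side dispatch above selects one of three kernels by row length: insertion sort for short rows, radix sort for medium rows, and a two-step split-then-merge path for long rows. A rough single-threaded Python sketch of the same selection logic (the thresholds and the chunk factor below are illustrative stand-ins, not the kernel's actual constants):

```python
import heapq

def top_k_per_row(row, k, insertion_threshold=2048, split_threshold=65536):
    """Sketch of the three-way dispatch in top_k_per_row_decode."""
    if len(row) < insertion_threshold:
        # Small rows: simple full selection (kernel: per-row insertion sort).
        return sorted(range(len(row)), key=lambda i: row[i], reverse=True)[:k]
    elif len(row) < split_threshold:
        # Medium rows: one-pass selection (kernel: radix sort).
        return heapq.nlargest(k, range(len(row)), key=lambda i: row[i])
    else:
        # Long rows: split into chunks, top-k per chunk, then merge the
        # partial results (kernel: multipleBlocksPerRowConfig blocks + merge).
        chunk = (len(row) + 9) // 10
        partial = []
        for start in range(0, len(row), chunk):
            idxs = range(start, min(start + chunk, len(row)))
            partial.extend(heapq.nlargest(k, idxs, key=lambda i: row[i]))
        return heapq.nlargest(k, partial, key=lambda i: row[i])
```

The split path is lossless because every global top-k element is necessarily within the top-k of its own chunk, so merging per-chunk winners recovers the exact global answer.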


@@ -1,373 +1,156 @@
// Portions of this file are adapted from SGLang PR:
// https://github.com/sgl-project/sglang/pull/11194
// and
// https://github.com/sgl-project/sglang/pull/17747
// Persistent TopK kernel for DeepSeek V3 sparse attention indexer.
// See persistent_topk.cuh for kernel implementation.
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include <algorithm>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#include "persistent_topk.cuh"
#endif
namespace vllm {
void persistent_topk(const torch::Tensor& logits, const torch::Tensor& lengths,
torch::Tensor& output, torch::Tensor& workspace, int64_t k,
int64_t max_seq_len) {
#ifndef USE_ROCM
TORCH_CHECK(logits.is_cuda(), "logits must be CUDA tensor");
TORCH_CHECK(lengths.is_cuda(), "lengths must be CUDA tensor");
TORCH_CHECK(output.is_cuda(), "output must be CUDA tensor");
TORCH_CHECK(logits.dtype() == torch::kFloat32, "Only float32 supported");
TORCH_CHECK(lengths.dtype() == torch::kInt32, "lengths must be int32");
TORCH_CHECK(output.dtype() == torch::kInt32, "output must be int32");
TORCH_CHECK(logits.dim() == 2, "logits must be 2D");
TORCH_CHECK(lengths.dim() == 1 || lengths.dim() == 2,
"lengths must be 1D or 2D");
TORCH_CHECK(lengths.is_contiguous(), "lengths must be contiguous");
TORCH_CHECK(output.dim() == 2, "output must be 2D");
constexpr int TopK = 2048; // DeepSeek V3 sparse attention top-k
constexpr int kThreadsPerBlock = 1024; // Threads per block
const int64_t num_rows = logits.size(0);
const int64_t stride = logits.size(1);
// Shared memory budget
#if defined(USE_ROCM)
constexpr size_t kSmem = 48 * 1024; // ROCm default: 48KB
#else
// Reduced from 128KB to 32KB to improve occupancy.
// Each radix pass needs at most ~TopK candidates in the threshold bin,
// so 4K entries per round (2 rounds = 8K entries = 32KB) is sufficient.
constexpr size_t kSmem = 8 * 1024 * sizeof(uint32_t); // 32KB (bytes)
#endif
TORCH_CHECK(lengths.numel() == num_rows, "lengths size mismatch");
TORCH_CHECK(output.size(0) == num_rows && output.size(1) == k,
"output size mismatch");
namespace P = vllm::persistent;
struct FastTopKParams {
const float* __restrict__ input; // [batch, seq_len] Logits
const int32_t* __restrict__ row_starts; // [batch] Offset into each row
// (optional)
int32_t* __restrict__ indices; // [batch, TopK] Output top-k indices
int32_t* __restrict__ lengths; // [batch] Sequence lengths per row
int64_t input_stride; // Stride between rows
};
TORCH_CHECK(k == P::TopK, "k must be 2048");
TORCH_CHECK(k <= stride, "k out of range");
__device__ __forceinline__ auto convert_to_uint32_v2(float x) -> uint32_t {
uint32_t bits = __float_as_uint(x);
return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u);
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
__device__ __forceinline__ auto convert_to_uint8(float x) -> uint8_t {
__half h = __float2half_rn(x);
uint16_t bits = __half_as_ushort(h);
uint16_t key = (bits & 0x8000) ? static_cast<uint16_t>(~bits)
: static_cast<uint16_t>(bits | 0x8000);
return static_cast<uint8_t>(key >> 8);
}
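Both helpers above use the standard order-preserving float-to-unsigned mapping: flip all bits of a negative float, set the sign bit of a non-negative one, so that unsigned integer comparison agrees with float comparison. A quick Python check of the 32-bit variant (illustrative only, not part of the kernel source):

```python
import struct

def float_to_sortable_u32(x: float) -> int:
    """Mirror of convert_to_uint32_v2: monotone float -> uint32 key."""
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    # Negative floats: invert all bits; non-negative: set the sign bit.
    return (~bits & 0xFFFFFFFF) if (bits & 0x80000000) else (bits | 0x80000000)

vals = [-3.5, -0.0, 0.0, 1.25, 2.0, float("inf")]
keys = [float_to_sortable_u32(v) for v in vals]
assert keys == sorted(keys)  # integer order matches float order
```

The 8-bit variant applies the same trick to FP16 bits and keeps only the high byte, which is exactly what the coarse first histogram pass needs.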
__device__ void naive_topk_cuda(const float* __restrict__ logits,
int32_t* __restrict__ output_indices,
int32_t seq_len) {
const int thread_id = threadIdx.x;
for (int i = thread_id; i < TopK; i += kThreadsPerBlock) {
output_indices[i] = (i < seq_len) ? i : -1;
}
}
// Adapted from:
// https://github.com/sgl-project/sglang/blob/v0.5.8/sgl-kernel/csrc/elementwise/topk.cu#L87
// by: DarkSharpness
// which at the same time is an optimized topk kernel copied from tilelang
// kernel
__device__ void fast_topk_cuda_tl(
const float* __restrict__ logits, // Input logits [seq_len]
int* __restrict__ output_indices, // Output top-k indices [TopK]
int logits_offset, // Starting offset in logits array
int seq_len) // Number of valid logits to process
{
constexpr int RADIX = 256;
constexpr int MAX_BUFFERED_ITEMS = kSmem / (2 * sizeof(int));
alignas(128) __shared__ int shared_histogram[2][RADIX + 128];
alignas(128) __shared__ int shared_output_count;
alignas(128) __shared__ int shared_threshold_bin;
alignas(128) __shared__ int shared_buffered_count[2];
extern __shared__ int buffered_indices[][MAX_BUFFERED_ITEMS];
const int thread_id = threadIdx.x;
int remaining_k = TopK;
// Pass 0: Build coarse 8-bit histogram using FP16 high bits
if (thread_id < RADIX + 1) {
shared_histogram[0][thread_id] = 0;
}
__syncthreads();
for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) {
const auto bin = convert_to_uint8(logits[idx + logits_offset]);
::atomicAdd(&shared_histogram[0][bin], 1);
}
__syncthreads();
// Helper: Compute cumulative sum (suffix sum) over histogram using ping-pong
// buffers
auto compute_cumulative_sum = [&]() {
static_assert(1 << 8 == RADIX,
"Radix must be 256 for 8 unrolled iterations");
#pragma unroll 8
for (int i = 0; i < 8; ++i) {
if (C10_LIKELY(thread_id < RADIX)) {
const int stride = 1 << i;
const int src_buffer = i & 1;
const int dst_buffer = src_buffer ^ 1;
int value = shared_histogram[src_buffer][thread_id];
if (thread_id < RADIX - stride) {
value += shared_histogram[src_buffer][thread_id + stride];
}
shared_histogram[dst_buffer][thread_id] = value;
}
__syncthreads();
}
};
compute_cumulative_sum();
// Find threshold bin where cumsum crosses remaining_k
if (thread_id < RADIX && shared_histogram[0][thread_id] > remaining_k &&
shared_histogram[0][thread_id + 1] <= remaining_k) {
shared_threshold_bin = thread_id;
shared_buffered_count[0] = 0;
shared_output_count = 0;
}
__syncthreads();
const int threshold_bin = shared_threshold_bin;
remaining_k -= shared_histogram[0][threshold_bin + 1];
// Early exit if threshold bin perfectly matches remaining_k
if (remaining_k == 0) {
for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) {
const int bin = convert_to_uint8(logits[idx + logits_offset]);
if (bin > threshold_bin) {
const int output_pos = ::atomicAdd(&shared_output_count, 1);
output_indices[output_pos] = idx;
}
}
__syncthreads();
return;
static int num_sms = 0;
static int max_smem_per_block = 0;
if (num_sms == 0) {
int device;
cudaGetDevice(&device);
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device);
cudaDeviceGetAttribute(&max_smem_per_block,
cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
}
// Prepare for refinement passes: Process threshold bin
__syncthreads();
if (thread_id < RADIX + 1) {
shared_histogram[0][thread_id] = 0;
}
__syncthreads();
// Scan all elements and:
// 1. Write indices > threshold_bin to output
// 2. Buffer indices == threshold_bin for refinement
// 3. Build histogram for next refinement pass (fused optimization)
for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) {
const float logit_value = logits[idx + logits_offset];
const int bin = convert_to_uint8(logit_value);
if (bin > threshold_bin) {
// in top-k, write to output
const int output_pos = ::atomicAdd(&shared_output_count, 1);
output_indices[output_pos] = idx;
} else if (bin == threshold_bin) {
// Candidate for top-k, needs refinement
const int buffer_pos = ::atomicAdd(&shared_buffered_count[0], 1);
if (C10_LIKELY(buffer_pos < MAX_BUFFERED_ITEMS)) {
buffered_indices[0][buffer_pos] = idx;
// Fused: Build histogram for next pass
const uint32_t fp32_bits = convert_to_uint32_v2(logit_value);
const int next_bin = (fp32_bits >> 24) & 0xFF;
::atomicAdd(&shared_histogram[0][next_bin], 1);
}
}
}
__syncthreads();
// ============================================================================
// Passes 1-4: Refine using 8-bit passes over FP32 bits
// ============================================================================
// FP32 bits [31:0] split into 4 bytes processed MSB-first:
// Pass 1: bits [31:24], Pass 2: bits [23:16], Pass 3: bits [15:8], Pass 4:
// bits [7:0]
#pragma unroll 4
for (int pass = 0; pass < 4; ++pass) {
__shared__ int shared_final_k; // For final pass: remaining slots to fill
const int src_buffer = pass % 2;
const int dst_buffer = src_buffer ^ 1;
// Clamp buffered count to prevent overflow
const int raw_buffered = shared_buffered_count[src_buffer];
const int num_buffered =
(raw_buffered < MAX_BUFFERED_ITEMS) ? raw_buffered : MAX_BUFFERED_ITEMS;
compute_cumulative_sum();
// Find threshold bin for this pass
if (thread_id < RADIX && shared_histogram[0][thread_id] > remaining_k &&
shared_histogram[0][thread_id + 1] <= remaining_k) {
shared_threshold_bin = thread_id;
shared_buffered_count[dst_buffer] = 0;
shared_final_k = remaining_k - shared_histogram[0][thread_id + 1];
}
__syncthreads();
const int threshold_bin = shared_threshold_bin;
remaining_k -= shared_histogram[0][threshold_bin + 1];
// Bit offset for this pass: 24, 16, 8, 0
const int bit_offset = 24 - pass * 8;
// Early exit if threshold bin perfectly matches
if (remaining_k == 0) {
for (int i = thread_id; i < num_buffered; i += kThreadsPerBlock) {
const int idx = buffered_indices[src_buffer][i];
const uint32_t fp32_bits =
convert_to_uint32_v2(logits[idx + logits_offset]);
const int bin = (fp32_bits >> bit_offset) & 0xFF;
if (bin > threshold_bin) {
const int output_pos = ::atomicAdd(&shared_output_count, 1);
output_indices[output_pos] = idx;
}
}
__syncthreads();
break;
}
// Continue refinement
__syncthreads();
if (thread_id < RADIX + 1) {
shared_histogram[0][thread_id] = 0;
}
__syncthreads();
for (int i = thread_id; i < num_buffered; i += kThreadsPerBlock) {
const int idx = buffered_indices[src_buffer][i];
const float logit_value = logits[idx + logits_offset];
const uint32_t fp32_bits = convert_to_uint32_v2(logit_value);
const int bin = (fp32_bits >> bit_offset) & 0xFF;
if (bin > threshold_bin) {
// Definitely in top-k
const int output_pos = ::atomicAdd(&shared_output_count, 1);
output_indices[output_pos] = idx;
} else if (bin == threshold_bin) {
if (pass == 3) {
// Final pass (bits [7:0]): No more refinement possible
// Fill remaining slots in reverse order to maintain descending order
const int slot = ::atomicAdd(&shared_final_k, -1);
if (slot > 0) {
output_indices[TopK - slot] = idx;
}
if (num_rows > 32 && max_smem_per_block >= 128 * 1024) {
cudaError_t status = vllm::FilteredTopKRaggedTransform<float, int32_t>(
logits.data_ptr<float>(), output.data_ptr<int32_t>(),
lengths.data_ptr<int32_t>(), static_cast<uint32_t>(num_rows),
static_cast<uint32_t>(k), static_cast<uint32_t>(stride), stream);
TORCH_CHECK(status == cudaSuccess,
"FilteredTopK failed: ", cudaGetErrorString(status));
} else {
// Buffer for next pass and build next histogram
const int buffer_pos =
::atomicAdd(&shared_buffered_count[dst_buffer], 1);
if (C10_LIKELY(buffer_pos < MAX_BUFFERED_ITEMS)) {
buffered_indices[dst_buffer][buffer_pos] = idx;
// Fused: Build histogram for next pass
const int next_bit_offset = bit_offset - 8;
const int next_bin = (fp32_bits >> next_bit_offset) & 0xFF;
::atomicAdd(&shared_histogram[0][next_bin], 1);
}
}
}
}
__syncthreads();
}
}
TORCH_CHECK(workspace.is_cuda(), "workspace must be CUDA tensor");
TORCH_CHECK(workspace.dtype() == torch::kUInt8, "workspace must be uint8");
__global__ __launch_bounds__(kThreadsPerBlock) void topk_kernel(
const FastTopKParams params) {
const auto& [input, row_starts, indices, lengths, input_stride] = params;
const uint64_t batch_idx = blockIdx.x;
const int logits_offset = row_starts == nullptr ? 0 : row_starts[batch_idx];
const int seq_len = lengths[batch_idx];
int* output_indices = indices + batch_idx * TopK;
const float* logits = input + batch_idx * input_stride;
if (seq_len <= TopK) {
// Shortcut: All elements are in top-k
return naive_topk_cuda(logits, output_indices, seq_len);
// Smem cap: smaller smem → more CTAs/group → more per-row parallelism for
// large path. Empirically tuned.
int effective_max_smem;
if (num_rows <= 4) {
effective_max_smem =
std::min(max_smem_per_block, static_cast<int>(P::kSmemMedium));
} else if (num_rows <= 8) {
constexpr int kSmemCapMedium = 48 * 1024;
effective_max_smem = std::min(max_smem_per_block, kSmemCapMedium);
} else {
return fast_topk_cuda_tl(logits, output_indices, logits_offset, seq_len);
}
}
FastTopKParams get_params(
const at::Tensor& score, const at::Tensor& lengths,
std::optional<at::Tensor> row_starts_opt = std::nullopt,
std::optional<at::Tensor> indices_opt = std::nullopt) {
const int64_t batch_size = score.size(0);
TORCH_CHECK(score.dim() == 2 && score.stride(1) == 1,
"score must be 2D with contiguous rows");
TORCH_CHECK(lengths.dim() == 1 && lengths.is_contiguous() &&
lengths.size(0) == batch_size,
"lengths must be 1D contiguous with size matching batch");
const int32_t* row_starts_ptr = nullptr;
if (row_starts_opt.has_value()) {
const auto& row_starts = *row_starts_opt;
TORCH_CHECK(row_starts.dim() == 1 && row_starts.size(0) == batch_size,
"row_starts must be 1D with size matching batch");
row_starts_ptr = row_starts.data_ptr<int32_t>();
effective_max_smem = max_smem_per_block;
}
int32_t* indices_ptr = nullptr;
if (indices_opt.has_value()) {
const auto& indices = *indices_opt;
TORCH_CHECK(indices.dim() == 2 && indices.is_contiguous() &&
indices.size(0) == batch_size && indices.size(1) == TopK,
"indices must be 2D contiguous [batch, TopK]");
indices_ptr = indices.data_ptr<int32_t>();
size_t available_for_ordered =
static_cast<size_t>(effective_max_smem) - P::kFixedSmemLarge;
uint32_t max_chunk_elements =
static_cast<uint32_t>(available_for_ordered / sizeof(uint32_t));
uint32_t vec_size = 1;
if (stride % 4 == 0)
vec_size = 4;
else if (stride % 2 == 0)
vec_size = 2;
max_chunk_elements = (max_chunk_elements / vec_size) * vec_size;
uint32_t min_chunk = vec_size * P::kThreadsPerBlock;
if (max_chunk_elements < min_chunk) max_chunk_elements = min_chunk;
uint32_t ctas_per_group =
(static_cast<uint32_t>(stride) + max_chunk_elements - 1) /
max_chunk_elements;
uint32_t chunk_size =
(static_cast<uint32_t>(stride) + ctas_per_group - 1) / ctas_per_group;
chunk_size = ((chunk_size + vec_size - 1) / vec_size) * vec_size;
if (chunk_size > max_chunk_elements) chunk_size = max_chunk_elements;
size_t smem_size = P::kFixedSmemLarge + chunk_size * sizeof(uint32_t);
if (smem_size < P::kSmemMedium) smem_size = P::kSmemMedium;
int occupancy = 1;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&occupancy, P::persistent_topk_kernel<4>, P::kThreadsPerBlock,
smem_size);
if (occupancy < 1) occupancy = 1;
uint32_t max_resident_ctas = static_cast<uint32_t>(num_sms) * occupancy;
uint32_t num_groups = std::min(max_resident_ctas / ctas_per_group,
static_cast<uint32_t>(num_rows));
if (num_groups == 0) num_groups = 1;
uint32_t total_ctas = num_groups * ctas_per_group;
size_t state_bytes = num_groups * sizeof(P::RadixRowState);
TORCH_CHECK(workspace.size(0) >= static_cast<int64_t>(state_bytes),
"workspace too small, need ", state_bytes, " bytes");
P::PersistentTopKParams params;
params.input = logits.data_ptr<float>();
params.output = output.data_ptr<int32_t>();
params.lengths = lengths.data_ptr<int32_t>();
params.num_rows = static_cast<uint32_t>(num_rows);
params.stride = static_cast<uint32_t>(stride);
params.chunk_size = chunk_size;
params.row_states =
reinterpret_cast<P::RadixRowState*>(workspace.data_ptr<uint8_t>());
params.ctas_per_group = ctas_per_group;
params.max_seq_len = static_cast<uint32_t>(max_seq_len);
#define LAUNCH_PERSISTENT(VS) \
do { \
auto kernel = &P::persistent_topk_kernel<VS>; \
cudaError_t err = cudaFuncSetAttribute( \
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); \
TORCH_CHECK(err == cudaSuccess, \
"Failed to set smem: ", cudaGetErrorString(err)); \
kernel<<<total_ctas, P::kThreadsPerBlock, smem_size, stream>>>(params); \
} while (0)
if (vec_size == 4) {
LAUNCH_PERSISTENT(4);
} else if (vec_size == 2) {
LAUNCH_PERSISTENT(2);
} else {
LAUNCH_PERSISTENT(1);
}
#undef LAUNCH_PERSISTENT
}
return FastTopKParams{
.input = score.data_ptr<float>(),
.row_starts = row_starts_ptr,
.indices = indices_ptr,
.lengths = lengths.data_ptr<int32_t>(),
.input_stride = score.stride(0),
};
}
template <auto* kernel_func, size_t smem_bytes>
void setup_kernel_smem_once() {
static const cudaError_t result = []() -> cudaError_t {
#ifdef USE_ROCM
auto func_ptr = reinterpret_cast<const void*>(kernel_func);
cudaError_t err = cudaGetLastError();
TORCH_CHECK(err == cudaSuccess,
"persistent_topk failed: ", cudaGetErrorString(err));
#else
auto func_ptr = kernel_func;
TORCH_CHECK(false, "persistent_topk is not supported on ROCm");
#endif
return cudaFuncSetAttribute(
func_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
}();
TORCH_CHECK(
result == cudaSuccess,
"Failed to set kernel shared memory limit: ", cudaGetErrorString(result));
}
} // namespace vllm
void large_context_topk(
const torch::Tensor& logits, torch::Tensor& indices,
const torch::Tensor& seq_lens,
std::optional<torch::Tensor> row_starts = std::nullopt) {
TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor");
TORCH_CHECK(indices.is_cuda(), "indices must be a CUDA tensor");
TORCH_CHECK(seq_lens.is_cuda(), "seq_lens must be a CUDA tensor");
if (row_starts.has_value()) {
TORCH_CHECK(row_starts->is_cuda(), "row_starts must be a CUDA tensor");
}
const auto params = vllm::get_params(logits, seq_lens, row_starts, indices);
const int64_t batch_size = logits.size(0);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const dim3 grid(static_cast<uint32_t>(batch_size));
const dim3 block(vllm::kThreadsPerBlock);
vllm::setup_kernel_smem_once<vllm::topk_kernel, vllm::kSmem>();
vllm::topk_kernel<<<grid, block, vllm::kSmem, stream>>>(params);
const cudaError_t result = cudaGetLastError();
TORCH_CHECK(result == cudaSuccess,
"large_context_topk kernel failed: ", cudaGetErrorString(result));
}


@@ -110,6 +110,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
// Fused SiLU+Mul + per-block quantization
ops.def(
"silu_and_mul_per_block_quant("
"Tensor! out, "
"Tensor input, "
"Tensor! scales, "
"int group_size, "
"Tensor? scale_ub=None, "
"bool is_scale_transposed=False) -> ()");
ops.impl("silu_and_mul_per_block_quant", torch::kCUDA,
&silu_and_mul_per_block_quant);
ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
@@ -185,10 +197,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
ops.def(
"large_context_topk(Tensor score, Tensor indices, Tensor lengths, "
"Tensor? "
"row_starts_opt) -> ()");
ops.impl("large_context_topk", torch::kCUDA, &large_context_topk);
"persistent_topk(Tensor logits, Tensor lengths, Tensor! output, "
"Tensor workspace, int k, int max_seq_len) -> ()");
ops.impl("persistent_topk", torch::kCUDA, &persistent_topk);
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
@@ -233,17 +244,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantization ops
#ifndef USE_ROCM
// Fused SiLU+Mul + per-block quantization
ops.def(
"silu_and_mul_per_block_quant("
"Tensor! out, "
"Tensor input, "
"Tensor! scales, "
"int group_size, "
"Tensor? scale_ub=None, "
"bool is_scale_transposed=False) -> ()");
ops.impl("silu_and_mul_per_block_quant", torch::kCUDA,
&silu_and_mul_per_block_quant);
// DeepSeek V3 fused A GEMM (SM 9.0+, bf16 only, 1-16 tokens).
ops.def(
"dsv3_fused_a_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");


@@ -22,7 +22,7 @@
# docker buildx bake -f docker/docker-bake.hcl -f docker/versions.json
# =============================================================================
ARG CUDA_VERSION=12.9.1
ARG CUDA_VERSION=13.0.0
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=22.04
@@ -37,7 +37,7 @@ ARG UBUNTU_VERSION=22.04
# compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}
@@ -315,7 +315,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
#################### CSRC BUILD IMAGE ####################
#################### EXTENSIONS BUILD IMAGE ####################
# Build DeepGEMM, DeepEP - runs in PARALLEL with csrc-build
# Build DeepEP - runs in PARALLEL with csrc-build
# This stage is independent and doesn't affect csrc cache
FROM base AS extensions-build
ARG CUDA_VERSION
@@ -327,21 +327,6 @@ ENV UV_LINK_MODE=copy
WORKDIR /workspace
# Build DeepGEMM wheel
# Default moved here from tools/install_deepgemm.sh for centralized version management
ARG DEEPGEMM_GIT_REF=477618cd51baffca09c4b0b87e97c03fe827ef03
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/deepgemm/dist && \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
--cuda-version "${CUDA_VERSION}" \
${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
--wheel-dir /tmp/deepgemm/dist || \
echo "DeepGEMM build skipped (CUDA version requirement not met)"
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
# Build DeepEP wheels
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
@@ -426,7 +411,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
# Copy extension wheels from extensions-build stage for later use
COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist
# Check the size of the wheel if RUN_WHEEL_CHECK is true
@@ -546,17 +530,23 @@ RUN apt-get update -y \
# Install CUDA development tools for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
CUDA_VERSION_SHORT=$(echo $CUDA_VERSION | cut -d. -f1,2) && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
apt-get install -y --no-install-recommends --allow-change-held-packages \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Required by fastsafetensors (fixes #20384)
libnuma-dev && \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
# NCCL packages don't use the cuda-MAJOR-MINOR naming convention,
# so we pin the version to match our CUDA version
NCCL_VER=$(apt-cache madison libnccl-dev | grep "+cuda${CUDA_VERSION_SHORT}" | head -1 | awk -F'|' '{gsub(/^ +| +$/, "", $2); print $2}') && \
apt-get install -y --no-install-recommends --allow-change-held-packages libnccl-dev=${NCCL_VER} libnccl2=${NCCL_VER} && \
rm -rf /var/lib/apt/lists/*
# Install uv for faster pip installs
@@ -689,15 +679,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list
# Install deepgemm wheel that has been built in the `build` stage
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
uv pip install --system /tmp/deepgemm/dist/*.whl; \
else \
echo "No DeepGEMM wheels to install; skipping."; \
fi'
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
@@ -822,7 +803,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
# if the above fails, install from source
apt-get update -y && \
apt-get install -y --no-install-recommends ${BUILD_PKGS} && \
apt-get install -y --no-install-recommends --allow-change-held-packages ${BUILD_PKGS} && \
uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
apt-get purge -y ${BUILD_PKGS} && \
# clean up -dev packages, keep runtime libraries


@@ -140,7 +140,7 @@ RUN \
esac; \
}; \
remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \
sed -i 's/^torch==.*/torch==2.11.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu


@@ -19,7 +19,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
apt-transport-https ca-certificates wget curl \
libnuma-dev
RUN python3 -m pip install --upgrade pip
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
ARG USE_SCCACHE
@@ -332,10 +333,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \
&& uv pip install --system *.whl
# Verify that PyTorch is the ROCm build, not CUDA
RUN python3 -c "import torch; assert torch.version.hip is not None, \
f'Expected ROCm PyTorch but got CUDA (torch.version.cuda={torch.version.cuda}, torch.version.hip={torch.version.hip})'; \
print(f'Verified: PyTorch {torch.__version__} with ROCm (HIP {torch.version.hip})')"
# Persist the built wheel in the image so python_only_compile_rocm.sh can
# reinstall it after removing compilers. The bind-mounted /install contents
# above are not available once that RUN step completes.
COPY --from=export_vllm /*.whl /opt/vllm-wheels/
# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
@@ -390,7 +391,21 @@ ENV MIOPEN_DEBUG_CONV_GEMM=0
RUN mkdir src && mv vllm src/vllm
# This is a workaround to ensure pytest exits with the correct status code in CI tests.
RUN echo "import os\n\ndef pytest_sessionfinish(session, exitstatus):\n os._exit(int(exitstatus))" > /vllm-workspace/conftest.py
RUN printf '%s\n' \
'import os' \
'' \
'_exit_code = 1' \
'' \
'def pytest_sessionfinish(session, exitstatus):' \
' global _exit_code' \
' _exit_code = int(exitstatus)' \
'' \
'def pytest_unconfigure(config):' \
' import sys' \
' sys.stdout.flush()' \
' sys.stderr.flush()' \
' os._exit(_exit_code)' \
> /vllm-workspace/conftest.py
# -----------------------
# Final vLLM image


@@ -9,7 +9,7 @@ ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="v0.1.10.post2"
ARG AITER_BRANCH="v0.1.12"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git"
@@ -112,10 +112,14 @@ FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
# Cherry picking the following
# https://github.com/triton-lang/triton/pull/8991
# https://github.com/triton-lang/triton/pull/9541
RUN cd triton \
&& git checkout ${TRITON_BRANCH} \
&& git config --global user.email "you@example.com" && git config --global user.name "Your Name" \
&& git cherry-pick 555d04f \
&& git cherry-pick dd998b6 \
&& if [ ! -f setup.py ]; then cd python; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& mkdir -p /app/install && cp dist/*.whl /app/install


@@ -93,13 +93,13 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
FROM python-install AS torch-vision
# Install torchvision
ARG TORCH_VISION_VERSION=v0.25.0
ARG TORCH_VISION_VERSION=v0.26.0
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
git clone https://github.com/pytorch/vision.git && \
cd vision && \
git checkout $TORCH_VISION_VERSION && \
uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cpu && \
uv pip install torch==2.11.0 --index-url https://download.pytorch.org/whl/cpu && \
python setup.py bdist_wheel
FROM python-install AS hf-xet-builder


@@ -2,7 +2,7 @@
"_comment": "Auto-generated from Dockerfile ARGs. Do not edit manually. Run: python tools/generate_versions_json.py",
"variable": {
"CUDA_VERSION": {
"default": "12.9.1"
"default": "13.0.0"
},
"PYTHON_VERSION": {
"default": "3.12"
@@ -11,10 +11,10 @@
"default": "22.04"
},
"BUILD_BASE_IMAGE": {
"default": "nvidia/cuda:12.9.1-devel-ubuntu20.04"
"default": "nvidia/cuda:13.0.0-devel-ubuntu22.04"
},
"FINAL_BASE_IMAGE": {
"default": "nvidia/cuda:12.9.1-base-ubuntu22.04"
"default": "nvidia/cuda:13.0.0-base-ubuntu22.04"
},
"GET_PIP_URL": {
"default": "https://bootstrap.pypa.io/get-pip.py"
@@ -52,9 +52,6 @@
"vllm_target_device": {
"default": "cuda"
},
"DEEPGEMM_GIT_REF": {
"default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
},
"DEEPEP_COMMIT_HASH": {
"default": "73b6ea4"
},


@@ -25,7 +25,7 @@ hide:
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has grown into one of the most active open-source AI projects, built and maintained by a diverse community of over 2,000 contributors from many dozens of academic institutions and companies.
Where to get started with vLLM depends on the type of user. If you are looking to:
@@ -42,23 +42,37 @@ vLLM is fast with:
- State-of-the-art serving throughput
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
- Continuous batching of incoming requests, chunked prefill, prefix caching
- Fast and flexible model execution with piecewise and full CUDA/HIP graphs
- Quantization: FP8, MXFP8/MXFP4, NVFP4, INT8, INT4, GPTQ/AWQ, GGUF, compressed-tensors, ModelOpt, TorchAO, and [more](https://docs.vllm.ai/en/latest/features/quantization/index.html)
- Optimized attention kernels including FlashAttention, FlashInfer, TRTLLM-GEN, FlashMLA, and Triton
- Optimized GEMM/MoE kernels for various precisions using CUTLASS, TRTLLM-GEN, CuTeDSL
- Speculative decoding including n-gram, suffix, EAGLE, DFlash
- Automatic kernel generation and graph-level transformations using torch.compile
- Disaggregated prefill, decode, and encode
vLLM is flexible and easy to use with:
- Seamless integration with popular HuggingFace models
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Tensor, pipeline, data, expert, and context parallelism for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support
- Multi-LoRA support
- Generation of structured outputs using xgrammar or guidance
- Tool calling and reasoning parsers
- OpenAI-compatible API server, plus Anthropic Messages API and gRPC support
- Efficient multi-LoRA support for dense and MoE layers
- Support for NVIDIA GPUs, AMD GPUs, and x86/ARM/PowerPC CPUs. Additionally, diverse hardware plugins such as Google TPUs, Intel Gaudi, IBM Spyre, Huawei Ascend, Rebellions NPU, Apple Silicon, MetaX GPU, and more.
vLLM seamlessly supports 200+ model architectures on Hugging Face, including:
- Decoder-only LLMs (e.g., Llama, Qwen, Gemma)
- Mixture-of-Experts LLMs (e.g., Mixtral, DeepSeek-V3, Qwen-MoE, GPT-OSS)
- Hybrid attention and state-space models (e.g., Mamba, Qwen3.5)
- Multi-modal models (e.g., LLaVA, Qwen-VL, Pixtral)
- Embedding and retrieval models (e.g., E5-Mistral, GTE, ColBERT)
- Reward and classification models (e.g., Qwen-Math)
Find the full list of supported models [here](./models/supported_models.md).
For more information, check out the following:

*(Binary image file changed; diff not shown. Size: 325 KiB before, 315 KiB after.)*

@@ -140,6 +140,80 @@ Data parallelism replicates the entire model across multiple GPU sets and proces
Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
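The expert-sharding note above is simple arithmetic; the sketch below makes it concrete. The helper name and the uniform even split are assumptions for illustration, not vLLM internals:

```python
# Illustrative arithmetic only: MoE expert layers are sharded across the
# product of tensor-parallel and data-parallel sizes. Assumes experts
# divide evenly across ranks, which real deployments typically ensure.

def experts_per_rank(
    num_experts: int, tensor_parallel_size: int, data_parallel_size: int
) -> int:
    """Number of experts placed on each rank under TP x DP sharding."""
    ep_size = tensor_parallel_size * data_parallel_size
    assert num_experts % ep_size == 0, "sketch assumes an even split"
    return num_experts // ep_size

# e.g. a 64-expert model with tensor_parallel_size=2 and
# data_parallel_size=4 is sharded 8 ways -> 8 experts per rank
print(experts_per_rank(64, 2, 4))
```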
### NUMA Binding for Multi-Socket GPU Nodes
On multi-socket GPU servers, GPU worker processes can lose performance if their
CPU execution and memory allocation drift away from the NUMA node nearest to the
GPU. vLLM can pin each worker with `numactl` before the Python subprocess starts,
so the interpreter, imports, and early allocator state are created with the
desired NUMA policy from the beginning.
Use `--numa-bind` to enable the feature. By default, vLLM auto-detects the
GPU-to-NUMA mapping and uses `--cpunodebind=<node> --membind=<node>` for each
worker. When you need a custom CPU policy, add `--numa-bind-cpus` and vLLM will
switch to `--physcpubind=<cpu-list> --membind=<node>`.
These `--numa-bind*` options only apply to GPU execution processes. They do not
configure the CPU backend's separate thread-affinity controls. Automatic
GPU-to-NUMA detection is currently implemented for CUDA/NVML-based platforms;
other GPU backends must provide explicit binding lists if they use these
options.
`--numa-bind-nodes` takes one non-negative NUMA node index per visible GPU, in
the same order as the GPU indices.
`--numa-bind-cpus` takes one `numactl` CPU list per visible GPU, in the same
order as the GPU indices. Each CPU list must use
`numactl --physcpubind` syntax such as `0-3`, `0,2,4-7`, or `16-31,48-63`.
```bash
# Auto-detect NUMA nodes for visible GPUs
vllm serve meta-llama/Llama-3.1-8B-Instruct \
--tensor-parallel-size 4 \
--numa-bind
# Explicit NUMA-node mapping
vllm serve meta-llama/Llama-3.1-8B-Instruct \
--tensor-parallel-size 4 \
--numa-bind \
--numa-bind-nodes 0 0 1 1
# Explicit CPU pinning, useful for PCT or other high-frequency core layouts
vllm serve meta-llama/Llama-3.1-8B-Instruct \
--tensor-parallel-size 4 \
--numa-bind \
--numa-bind-nodes 0 0 1 1 \
--numa-bind-cpus 0-3 4-7 48-51 52-55
```
Notes:
- CLI usage forces multiprocessing to use the `spawn` method automatically. If you enable NUMA binding through the Python API, also set `VLLM_WORKER_MULTIPROC_METHOD=spawn`.
- Automatic detection relies on NVML and NUMA support from the host. If it cannot determine the mapping reliably, pass `--numa-bind-nodes` explicitly.
- Explicit `--numa-bind-nodes` and `--numa-bind-cpus` values must be valid `numactl` inputs. vLLM does a small amount of validation, but the effective binding semantics are still determined by `numactl`.
- The current implementation binds GPU execution processes such as `EngineCore` and multiprocessing workers. It does not apply NUMA binding to frontend API server processes or the DP coordinator.
- In containerized environments, NUMA policy syscalls may require extra permissions, such as `--cap-add SYS_NICE` when running via `docker run`.
### CPU Backend Thread Affinity
The CPU backend uses a different mechanism from `--numa-bind`. CPU execution is
configured through CPU-specific environment variables such as
`VLLM_CPU_OMP_THREADS_BIND`, `VLLM_CPU_NUM_OF_RESERVED_CPU`, and
`CPU_VISIBLE_MEMORY_NODES`, rather than the GPU-oriented `--numa-bind*` CLI
options.
By default, `VLLM_CPU_OMP_THREADS_BIND=auto` derives OpenMP placement from the
available CPU and NUMA topology for each CPU worker. To override the automatic
policy, set `VLLM_CPU_OMP_THREADS_BIND` explicitly using the CPU list format
documented for the CPU backend, or use `nobind` to disable this behavior.
For the current CPU backend setup and tuning guidance, see:
- [Related runtime environment variables](../getting_started/installation/cpu.md#related-runtime-environment-variables)
- [How to decide `VLLM_CPU_OMP_THREADS_BIND`](../getting_started/installation/cpu.md#how-to-decide-vllm_cpu_omp_threads_bind)
The GPU-only `--numa-bind`, `--numa-bind-nodes`, and `--numa-bind-cpus` options
do not configure CPU worker affinity.
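As a starting point, an explicit CPU-backend placement might look like the fragment below for a two-socket host running two CPU workers. The core ranges are illustrative; verify your topology with `lscpu` and consult the linked CPU backend docs for the exact list format:

```shell
# Illustrative: pin worker 0 to cores 0-31 and worker 1 to cores 32-63,
# with ranks separated by '|'. Check core IDs against your own topology.
export VLLM_CPU_OMP_THREADS_BIND="0-31|32-63"
# Optionally reserve a core per worker for the serving frontend.
export VLLM_CPU_NUM_OF_RESERVED_CPU=1
```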
### Batch-level DP for Multi-Modal Encoders
By default, TP is used to shard the weights of multi-modal encoders just like for language decoders,


@@ -57,8 +57,8 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoEMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w4a4_nvfp4.CompressedTensorsW4A4Nvfp4MoEMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w8a8_fp8.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
@@ -82,7 +82,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |


@@ -0,0 +1,167 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MkDocs hook to automatically convert inline code references to API doc links.
For example, `WeightTransferConfig` becomes
[`WeightTransferConfig`][vllm.config.WeightTransferConfig]
This works with the `autorefs` plugin to create clickable cross-references
to API documentation pages generated by `mkdocstrings`.
The hook builds an index of all documented public Python names (classes and
functions with docstrings) from the vllm package at startup using AST parsing,
then substitutes matching inline code spans on each page. Names without
docstrings are excluded because mkdocstrings will not generate a page for them.
"""
import ast
import logging
from pathlib import Path
import regex as re
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import Files
from mkdocs.structure.pages import Page
logger = logging.getLogger("mkdocs")
ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
VLLM_DIR = ROOT_DIR / "vllm"
# Maps short name -> qualified name (e.g. "ModelConfig" -> "vllm.config.ModelConfig")
_name_index: dict[str, str] = {}
# Fenced code block pattern (``` or ~~~, with optional language specifier).
_FENCED_BLOCK = re.compile(
r"(?:^|\n)(?P<fence>`{3,}|~{3,})[^\n]*\n.*?(?:\n(?P=fence))", re.DOTALL
)
# Inline code that is NOT already part of a markdown link.
# Matches `Name` but not [`Name`] and not [`Name`][...] or [`Name`](...).
_INLINE_CODE = re.compile(
r"(?<!\[)" # not preceded by [
r"`(?P<name>[A-Za-z0-9_]*)`" # `UpperCamelCase` or `UPPER_SNAKE`
r"(?!\])" # not followed by ]
)
def _has_docstring(node: ast.AST) -> bool:
"""Check if a class or function node has a docstring."""
if not isinstance(node, ast.ClassDef | ast.FunctionDef | ast.AsyncFunctionDef):
return False
return ast.get_docstring(node, clean=False) is not None
def _module_path(filepath: Path) -> str:
"""Convert a filesystem path to a dotted module path."""
rel = filepath.relative_to(ROOT_DIR)
parts = list(rel.with_suffix("").parts)
if parts[-1] == "__init__":
parts = parts[:-1]
return ".".join(parts)
def _index_file(filepath: Path) -> dict[str, str]:
"""Extract documented public names from a Python file using AST parsing.
Only classes and functions with docstrings are included, since
mkdocstrings won't generate a page for undocumented symbols.
"""
names: dict[str, str] = {}
try:
source = filepath.read_text(encoding="utf-8")
tree = ast.parse(source, filename=str(filepath))
except (SyntaxError, UnicodeDecodeError):
return names
module = _module_path(filepath)
for node in ast.iter_child_nodes(tree):
if (
# Class definitions (with docstring)
isinstance(node, ast.ClassDef)
and not node.name.startswith("_")
and _has_docstring(node)
) or (
# Function definitions (with docstring, only uppercase/CamelCase)
isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef)
and not node.name.startswith("_")
and node.name[0].isupper()
and _has_docstring(node)
):
names[node.name] = f"{module}.{node.name}"
return names
def _build_index() -> dict[str, str]:
"""Walk the vllm package and build a name -> qualified path index."""
index: dict[str, str] = {}
# Track conflicts: if multiple modules define the same name,
# prefer shallower modules (more likely to be the public API).
depth: dict[str, int] = {}
for filepath in sorted(VLLM_DIR.rglob("*.py")):
# Skip internal/private modules
if any(part.startswith("_") and part != "__init__" for part in filepath.parts):
continue
# Skip third-party vendored code
rel = filepath.relative_to(VLLM_DIR)
if rel.parts and rel.parts[0] in ("third_party", "vllm_flash_attn"):
continue
module_depth = len(filepath.relative_to(ROOT_DIR).parts)
file_names = _index_file(filepath)
for name, qualified in file_names.items():
if name not in index or module_depth < depth[name]:
index[name] = qualified
depth[name] = module_depth
return index
def on_startup(*, command: str, dirty: bool) -> None:
"""Build the name index once at startup."""
global _name_index
_name_index = _build_index()
logger.info("autoref_code: indexed %d names from vllm/", len(_name_index))
def on_page_markdown(
markdown: str, *, page: Page, config: MkDocsConfig, files: Files
) -> str:
"""Replace inline code references with autoref links."""
if not _name_index:
return markdown
# Skip API reference pages to avoid circular/redundant links.
if page.file.src_path.startswith("api/"):
return markdown
# Step 1: Mask fenced code blocks so we don't touch code inside them.
masks: list[str] = []
def _mask_block(match: re.Match) -> str:
masks.append(match.group(0))
return f"\ue000CODEBLOCK{len(masks) - 1}\ue000"
masked = _FENCED_BLOCK.sub(_mask_block, markdown)
# Step 2: Replace inline code references.
def _replace(match: re.Match) -> str:
name = match.group("name")
qualified = _name_index.get(name)
if qualified is None:
return match.group(0)
logger.debug("autoref_code: linking `%s` to [%s]", name, qualified)
return f"[`{name}`][{qualified}]"
result = _INLINE_CODE.sub(_replace, masked)
# Step 3: Restore masked code blocks.
result = re.sub(
r"\ue000CODEBLOCK(\d+)\ue000", lambda m: masks[int(m.group(1))], result
)
return result
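The hook's mask-substitute-restore pipeline can be exercised standalone. The sketch below reuses the same two regexes with a toy one-entry index; the `link_refs` name and the index contents are illustrative, not part of the hook:

```python
# Standalone sketch of the substitution pipeline in the hook above:
# inline code spans that match the index become autoref links, while
# fenced code blocks are masked out and restored untouched.
import re

INDEX = {"ModelConfig": "vllm.config.ModelConfig"}  # toy name index
FENCED = re.compile(
    r"(?:^|\n)(?P<fence>`{3,}|~{3,})[^\n]*\n.*?(?:\n(?P=fence))", re.DOTALL
)
INLINE = re.compile(r"(?<!\[)`(?P<name>[A-Za-z0-9_]*)`(?!\])")

def link_refs(markdown: str) -> str:
    masks: list[str] = []

    def mask(m: re.Match) -> str:
        masks.append(m.group(0))
        return f"\ue000B{len(masks) - 1}\ue000"

    masked = FENCED.sub(mask, markdown)  # step 1: hide fenced blocks

    def repl(m: re.Match) -> str:  # step 2: link known names
        qualified = INDEX.get(m.group("name"))
        return f"[`{m.group('name')}`][{qualified}]" if qualified else m.group(0)

    out = INLINE.sub(repl, masked)
    # step 3: restore the fenced blocks verbatim
    return re.sub(r"\ue000B(\d+)\ue000", lambda m: masks[int(m.group(1))], out)

print(link_refs("Use `ModelConfig` here."))
```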


@@ -59,7 +59,7 @@ class PydanticMagicMock(MagicMock):
"""`MagicMock` that's able to generate pydantic-core schemas."""
def __init__(self, *args, **kwargs):
name = kwargs.pop("name", None)
name = kwargs.get("name")
super().__init__(*args, **kwargs)
self.__spec__ = ModuleSpec(name, None)
@@ -85,7 +85,8 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
logger.info("Mocking %s for argparse doc generation", e.name)
sys.modules[e.name] = PydanticMagicMock(name=e.name)
except Exception:
logger.exception("Failed to import %s.%s: %s", module_name, attr)
logger.exception("Failed to import %s.%s", module_name, attr)
raise
raise ImportError(
f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
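The hunk above swaps `kwargs.pop("name")` for `kwargs.get("name")`. The reason, sketched below: `MagicMock` gives the `name` keyword special meaning, and popping it before calling `super().__init__` strips the name from the mock itself. A minimal reproduction of the fixed behavior (the `NamedMock` class name is illustrative):

```python
# Minimal repro of the fix: use kwargs.get() so MagicMock still receives
# the name kwarg, while we also record it on a ModuleSpec.
from importlib.machinery import ModuleSpec
from unittest.mock import MagicMock

class NamedMock(MagicMock):
    def __init__(self, *args, **kwargs):
        name = kwargs.get("name")  # get(), not pop(): MagicMock keeps the name
        super().__init__(*args, **kwargs)
        self.__spec__ = ModuleSpec(name, None)

m = NamedMock(name="fake_module")
print(m._mock_name)     # preserved because the kwarg was not popped
print(m.__spec__.name)
```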


@@ -457,6 +457,7 @@ th {
| `PanguEmbeddedForCausalLM` | openPangu-Embedded-7B | `FreedomIntelligence/openPangu-Embedded-7B-V1.1` | ✅︎ | ✅︎ |
| `PanguProMoEV2ForCausalLM` | openpangu-pro-moe-v2 | | ✅︎ | ✅︎ |
| `PanguUltraMoEForCausalLM` | openpangu-ultra-moe-718b-model | `FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1` | ✅︎ | ✅︎ |
| `Param2MoEForCausalLM` | param2moe | `bharatgenai/Param2-17B-A2.4B-Thinking`, etc. | ✅︎ | ✅︎ |
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ |
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
@@ -600,6 +601,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
| `Phi4ForCausalLMV` | Phi-4-reasoning-vision | T + I<sup>+</sup> | `microsoft/Phi-4-reasoning-vision-15B`, etc. | | ✅︎ |
| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, `mistralai/Pixtral-12B-2409`, etc. | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ |


@@ -1741,6 +1741,27 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
)
# Phi-4-reasoning-vision
def run_phi4siglip(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "microsoft/Phi-4-reasoning-vision-15B"
prompts = [
f"<|user|>\n<image>\n{question}<|end|>\n<|assistant|>\n"
for question in questions
]
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -2222,6 +2243,7 @@ model_example_map = {
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
"phi4_mm": run_phi4mm,
"phi4_siglip": run_phi4siglip,
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,


@@ -957,6 +957,24 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_phi4siglip(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "microsoft/Phi-4-reasoning-vision-15B"
placeholders = "\n".join("<image>" for _ in image_urls)
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs(
@@ -1455,6 +1473,7 @@ model_example_map = {
"paddleocr_vl": load_paddleocr_vl,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_siglip": load_phi4siglip,
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,


@@ -0,0 +1,331 @@
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{%- macro format_function_declaration(tool_data) -%}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{%- macro format_argument(argument, escape_keys=True) -%}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{%- macro strip_thinking(text) -%}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{%- macro format_tool_response_block(tool_name, response) -%}
{{- '<|tool_response>' -}}
{%- if response is mapping -%}
{{- 'response:' + tool_name + '{' -}}
{%- for key, value in response | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endmacro -%}
{%- set ns = namespace(prev_message_type=None) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{%- set ns_turn = namespace(last_user_idx=-1) -%}
{%- for i in range(loop_messages | length) -%}
{%- if loop_messages[i]['role'] == 'user' -%}
{%- set ns_turn.last_user_idx = i -%}
{%- endif -%}
{%- endfor -%}
{%- for message in loop_messages -%}
{%- if message['role'] != 'tool' -%}
{%- set ns.prev_message_type = None -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{#- OpenAI may emit multiple assistant messages in one tool loop (user → asst → tool → asst → tool).
Only the first of those should open <|turn>model; later ones continue the same model turn. -#}
{%- set prev_nt = namespace(role=None, found=false) -%}
{%- if loop.index0 > 0 -%}
{%- for j in range(loop.index0 - 1, -1, -1) -%}
{%- if not prev_nt.found -%}
{%- if loop_messages[j]['role'] != 'tool' -%}
{%- set prev_nt.role = loop_messages[j]['role'] -%}
{%- set prev_nt.found = true -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
{%- if not continue_same_model_turn -%}
{{- '<|turn>' + role + '\n' }}
{%- endif -%}
{%- if message.get('reasoning') and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
{{- '<|channel>thought\n' + message['reasoning'] + '\n<channel|>'}}
{%- endif -%}
{%- if message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{%- set ns_tr_out = namespace(flag=false) -%}
{%- if message.get('tool_responses') -%}
{#- Legacy: tool_responses embedded on the assistant message -#}
{%- for tool_response in message['tool_responses'] -%}
{{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endfor -%}
{%- elif message.get('tool_calls') -%}
{#- OpenAI Chat Completions: consecutive following messages with role "tool" (no break/continue; range scan) -#}
{%- set ns_tool_scan = namespace(stopped=false) -%}
{%- for k in range(loop.index0 + 1, loop_messages | length) -%}
{%- if ns_tool_scan.stopped -%}
{%- elif loop_messages[k]['role'] != 'tool' -%}
{%- set ns_tool_scan.stopped = true -%}
{%- else -%}
{%- set follow = loop_messages[k] -%}
{%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
{%- for tc in message['tool_calls'] -%}
{%- if tc.get('id') == follow.get('tool_call_id') -%}
{%- set ns_tname.name = tc['function']['name'] -%}
{%- endif -%}
{%- endfor -%}
{%- set tool_body = follow.get('content') -%}
{%- if tool_body is string -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- elif tool_body is sequence and tool_body is not string -%}
{%- set ns_txt = namespace(s='') -%}
{%- for part in tool_body -%}
{%- if part.get('type') == 'text' -%}
{%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
{%- endif -%}
{%- endfor -%}
{{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
{%- else -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- endif -%}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- if not (ns_tr_out.flag and not message.get('content')) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}
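The template's `format_argument` macro is dense Jinja; a rough Python equivalent helps check what a serialized tool-call payload looks like. The sketch below follows the macro's rules (`<|"|>` quote tokens, `dictsort` key ordering, lowercase booleans) but is an approximation for inspection, not a drop-in replacement:

```python
# Approximate Python port of the template's format_argument macro:
# strings are wrapped in <|"|> tokens, booleans render as true/false,
# mappings are emitted with sorted keys, sequences as [...].

def format_argument(arg, escape_keys: bool = True) -> str:
    if isinstance(arg, bool):  # check bool before str/int
        return "true" if arg else "false"
    if isinstance(arg, str):
        return f'<|"|>{arg}<|"|>'
    if isinstance(arg, dict):
        items = []
        for key, value in sorted(arg.items()):  # mirrors Jinja dictsort
            k = f'<|"|>{key}<|"|>' if escape_keys else key
            items.append(f"{k}:{format_argument(value, escape_keys)}")
        return "{" + ",".join(items) + "}"
    if isinstance(arg, (list, tuple)):
        return "[" + ",".join(format_argument(i, escape_keys) for i in arg) + "]"
    return str(arg)  # numbers and anything else render as-is

# Tool-call arguments use unescaped keys, per the macro's call sites:
print(format_argument({"unit": "C", "city": "Paris"}, escape_keys=False))
```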


@@ -54,6 +54,7 @@ hooks:
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/generate_metrics.py
- docs/mkdocs/hooks/url_schemes.py
- docs/mkdocs/hooks/autoref_code.py
plugins:
- meta


@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0",
"torch == 2.10.0",
"torch == 2.11.0",
"wheel",
"jinja2",
]


@@ -4,7 +4,7 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
torch==2.10.0
torch==2.11.0
wheel
jinja2>=3.1.6
regex


@@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.10.0
mistral_common[image] >= 1.11.0
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12


@@ -1,10 +1,11 @@
--extra-index-url https://download.pytorch.org/whl/cpu
cmake>=3.26.1
ninja
packaging>=24.2
setuptools==77.0.3 # this version can reuse CMake build dir
setuptools-scm>=8
torch==2.10.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le"
torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"
torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
wheel
jinja2>=3.1.6
regex


@@ -1,3 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
# Common dependencies
-r common.txt
@@ -6,8 +7,8 @@ setuptools==77.0.3 # this version can reuse CMake build dir
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
torch==2.10.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"
torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "s390x" and platform_machine != "riscv64"
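
The platform selection above relies on PEP 508 environment markers. As a rough illustration of how a resolver chooses between the two `torch` lines, here is a toy evaluator that only understands the `== / or` clauses used in this file (real tools use the `packaging` library; `marker_matches` and the sample environment are invented for this sketch):

```python
# Toy sketch of PEP 508 marker evaluation -- handles only the
# 'name == "value"' clauses joined by 'or' that appear in this file.
def marker_matches(marker: str, env: dict) -> bool:
    for clause in marker.split(" or "):
        name, _, value = clause.partition(" == ")
        if env.get(name.strip()) == value.strip().strip('"'):
            return True
    return False

# Hypothetical environment: a Linux aarch64 machine.
env = {"platform_machine": "aarch64", "platform_system": "Linux"}

candidates = {
    "torch==2.11.0+cpu": 'platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"',
    "torch==2.11.0": 'platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"',
}
selected = [req for req, m in candidates.items() if marker_matches(m, env)]
print(selected)  # -> ['torch==2.11.0+cpu']
```

Under this change, `aarch64` now falls under the `+cpu` wheel marker rather than the plain one, which is exactly what the two rewritten `torch` lines express.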

View File

@@ -4,10 +4,10 @@
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
torch==2.10.0
torchaudio==2.10.0
torch==2.11.0
torchaudio==2.11.0
# These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision==0.26.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.7
flashinfer-cubin==0.6.7
@@ -15,6 +15,9 @@ flashinfer-cubin==0.6.7
# breaking changes in 1.19.0
nvidia-cudnn-frontend>=1.13.0,<1.19.0
# Required for faster safetensors model loading
fastsafetensors >= 0.2.2
# QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
nvidia-cutlass-dsl>=4.4.2
quack-kernels>=0.3.3

View File

@@ -1,3 +1,3 @@
lmcache >= 0.3.9
nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
nixl[cu13] >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
mooncake-transfer-engine >= 0.3.8

View File

@@ -23,7 +23,7 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.1 # required for voxtral test
mistral_common[image,audio] >= 1.11.0 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test

View File

@@ -1,10 +1,11 @@
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm7.1
torch==2.10.0
torchvision==0.25.0
torchaudio==2.10.0
torch==2.11.0
torchvision==0.26.0
torchaudio==2.11.0
triton==3.6.0
cmake>=3.26.1,<4
packaging>=24.2

View File

@@ -1,3 +1,5 @@
-r common.txt
# testing
pytest
tensorizer==2.10.1
@@ -29,7 +31,7 @@ tblib # for pickling test exceptions
timm>=1.0.17 # required for internvl and gemma3n-mm test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio]>=1.10.0 # required for voxtral test
mistral_common[image,audio]>=1.11.0 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless>=4.13.0 # required for video test

View File

@@ -1,5 +1,5 @@
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/rocm-test.in -o requirements/rocm-test.txt --index-strategy unsafe-best-match -c requirements/rocm.txt --python-platform x86_64-manylinux_2_28 --python-version 3.12 --no-emit-package torch --no-emit-package torchvision --no-emit-package torchaudio --no-emit-package triton --no-emit-package cuda-bindings --no-emit-package cuda-pathfinder --no-emit-package cuda-toolkit --no-emit-package cupy-cuda12x --no-emit-package nvidia-cublas --no-emit-package nvidia-cuda-cupti --no-emit-package nvidia-cuda-nvrtc --no-emit-package nvidia-cuda-runtime --no-emit-package nvidia-cudnn-cu13 --no-emit-package nvidia-cufft --no-emit-package nvidia-cufile --no-emit-package nvidia-curand --no-emit-package nvidia-cusolver --no-emit-package nvidia-cusparse --no-emit-package nvidia-cusparselt-cu13 --no-emit-package nvidia-nccl-cu13 --no-emit-package nvidia-nvjitlink --no-emit-package nvidia-nvshmem-cu13 --no-emit-package nvidia-nvtx
# uv pip compile requirements/rocm-test.in -o requirements/rocm-test.txt --index-strategy unsafe-best-match -c requirements/rocm.txt --python-platform x86_64-manylinux_2_28 --python-version 3.12 --no-emit-package torch --no-emit-package torchvision --no-emit-package torchaudio --no-emit-package triton --no-emit-package cuda-bindings --no-emit-package cuda-pathfinder --no-emit-package cuda-toolkit --no-emit-package cupy-cuda12x --no-emit-package nvidia-cublas --no-emit-package nvidia-cuda-cupti --no-emit-package nvidia-cuda-nvrtc --no-emit-package nvidia-cuda-runtime --no-emit-package nvidia-cudnn --no-emit-package nvidia-cufft --no-emit-package nvidia-cufile --no-emit-package nvidia-curand --no-emit-package nvidia-cusolver --no-emit-package nvidia-cusparse --no-emit-package nvidia-cusparselt --no-emit-package nvidia-nccl --no-emit-package nvidia-nvjitlink --no-emit-package nvidia-nvshmem --no-emit-package nvidia-nvtx --no-emit-package nvidia-cublas-cu12 --no-emit-package nvidia-cuda-cupti-cu12 --no-emit-package nvidia-cuda-nvrtc-cu12 --no-emit-package nvidia-cuda-runtime-cu12 --no-emit-package nvidia-cudnn-cu12 --no-emit-package nvidia-cufft-cu12 --no-emit-package nvidia-cufile-cu12 --no-emit-package nvidia-curand-cu12 --no-emit-package nvidia-cusolver-cu12 --no-emit-package nvidia-cusparse-cu12 --no-emit-package nvidia-cusparselt-cu12 --no-emit-package nvidia-nccl-cu12 --no-emit-package nvidia-nvjitlink-cu12 --no-emit-package nvidia-nvshmem-cu12 --no-emit-package nvidia-nvtx-cu12 --no-emit-package nvidia-cublas-cu13 --no-emit-package nvidia-cuda-cupti-cu13 --no-emit-package nvidia-cuda-nvrtc-cu13 --no-emit-package nvidia-cuda-runtime-cu13 --no-emit-package nvidia-cudnn-cu13 --no-emit-package nvidia-cufft-cu13 --no-emit-package nvidia-cufile-cu13 --no-emit-package nvidia-curand-cu13 --no-emit-package nvidia-cusolver-cu13 --no-emit-package nvidia-cusparse-cu13 --no-emit-package nvidia-cusparselt-cu13 --no-emit-package nvidia-nccl-cu13 --no-emit-package nvidia-nvjitlink-cu13 --no-emit-package nvidia-nvshmem-cu13 --no-emit-package nvidia-nvtx-cu13
absl-py==2.4.0
# via
# rouge-score
@@ -15,6 +15,7 @@ aiohappyeyeballs==2.6.1
aiohttp==3.13.3
# via
# -c requirements/common.txt
# -r requirements/common.txt
# aiohttp-cors
# fsspec
# gpt-oss
@@ -38,20 +39,31 @@ annotated-doc==0.0.4
# typer
annotated-types==0.7.0
# via pydantic
anthropic==0.89.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
antlr4-python3-runtime==4.9.3
# via
# hydra-core
# omegaconf
anyio==4.6.2.post1
anyio==4.13.0
# via
# anthropic
# httpx
# mcp
# openai
# sse-starlette
# starlette
# watchfiles
arctic-inference==0.1.1
# via -r requirements/rocm-test.in
argcomplete==3.6.3
# via datamodel-code-generator
arrow==1.4.0
# via isoduration
astor==0.8.1
# via depyf
attrs==26.1.0
# via
# aiohttp
@@ -83,6 +95,8 @@ bitsandbytes==0.49.2
# lightning
black==26.3.1
# via datamodel-code-generator
blake3==1.0.8
# via -r requirements/common.txt
blobfile==3.0.0
# via -r requirements/rocm-test.in
bm25s==0.2.13
@@ -99,6 +113,10 @@ bounded-pool-executor==0.0.3
# via pqdm
buildkite-test-collector==0.1.9
# via -r requirements/rocm-test.in
cachetools==7.0.5
# via -r requirements/common.txt
cbor2==5.9.0
# via -r requirements/common.txt
certifi==2026.2.25
# via
# fiona
@@ -132,6 +150,7 @@ click==8.3.1
# nltk
# rasterio
# ray
# rich-toolkit
# schemathesis
# typer
# uvicorn
@@ -142,6 +161,8 @@ cligj==0.7.2
# via
# fiona
# rasterio
cloudpickle==3.1.2
# via -r requirements/common.txt
colorama==0.4.6
# via
# perceptron
@@ -151,6 +172,10 @@ colorful==0.5.8
# via ray
colorlog==6.10.1
# via optuna
compressed-tensors==0.14.0.1
# via
# -c requirements/common.txt
# -r requirements/common.txt
contourpy==1.3.3
# via matplotlib
coverage==7.13.5
@@ -182,24 +207,42 @@ decorator==5.2.1
# via librosa
decord==0.6.0
# via -r requirements/rocm-test.in
depyf==0.20.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
diffusers==0.37.0
# via terratorch
dill==0.3.8
# via
# datasets
# depyf
# evaluate
# lm-eval
# multiprocess
diskcache==5.6.3
# via
# -c requirements/common.txt
# -r requirements/common.txt
distlib==0.4.0
# via virtualenv
distro==1.9.0
# via
# anthropic
# openai
dnspython==2.8.0
# via email-validator
docker==7.1.0
# via gpt-oss
docopt==0.6.2
# via num2words
docstring-parser==0.17.0
# via jsonargparse
# via
# anthropic
# jsonargparse
einops==0.8.2
# via
# -r requirements/common.txt
# -r requirements/rocm-test.in
# encodec
# terratorch
@@ -208,6 +251,10 @@ einops==0.8.2
# vocos
einx==0.4.2
# via vector-quantize-pytorch
email-validator==2.3.0
# via
# fastapi
# pydantic
encodec==0.1.1
# via vocos
et-xmlfile==2.0.0
@@ -217,14 +264,25 @@ evaluate==0.4.6
fastapi==0.135.2
# via
# -c requirements/common.txt
# -r requirements/common.txt
# gpt-oss
# model-hosting-container-standards
fastapi-cli==0.0.24
# via fastapi
fastapi-cloud-cli==0.15.1
# via fastapi-cli
fastar==0.9.0
# via fastapi-cloud-cli
fastparquet==2026.3.0
# via genai-perf
fastsafetensors==0.2.2
# via -r requirements/rocm-test.in
# via
# -c requirements/rocm.txt
# -r requirements/rocm-test.in
filelock==3.25.2
# via
# -c requirements/common.txt
# -r requirements/common.txt
# blobfile
# datasets
# diffusers
@@ -264,6 +322,10 @@ genson==1.3.0
# via datamodel-code-generator
geopandas==1.1.3
# via terratorch
gguf==0.18.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
gitdb==4.0.12
# via gitpython
gitpython==3.1.46
@@ -290,7 +352,10 @@ google-crc32c==1.8.0
google-resumable-media==2.8.0
# via google-cloud-storage
googleapis-common-protos==1.73.0
# via google-api-core
# via
# google-api-core
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
gpt-oss==0.0.8
# via -r requirements/rocm-test.in
graphql-core==3.2.8
@@ -302,6 +367,7 @@ grpcio==1.78.0
# -c requirements/rocm.txt
# -r requirements/rocm-test.in
# grpcio-reflection
# opentelemetry-exporter-otlp-proto-grpc
# ray
# tensorboard
grpcio-reflection==1.78.0
@@ -328,12 +394,22 @@ html2text==2025.4.15
# via gpt-oss
httpcore==1.0.9
# via httpx
httptools==0.7.1
# via uvicorn
httpx==0.27.2
# via
# -r requirements/rocm-test.in
# anthropic
# diffusers
# fastapi
# fastapi-cloud-cli
# mcp
# model-hosting-container-standards
# openai
# perceptron
# schemathesis
httpx-sse==0.4.3
# via mcp
huggingface-hub==0.36.2
# via
# -r requirements/rocm-test.in
@@ -370,10 +446,13 @@ hypothesis-jsonschema==0.23.1
idna==3.11
# via
# anyio
# email-validator
# httpx
# jsonschema
# requests
# yarl
ijson==3.5.0
# via -r requirements/common.txt
imagehash==4.3.2
# via -r requirements/rocm-test.in
imageio==2.37.3
@@ -390,6 +469,8 @@ iniconfig==2.3.0
# via pytest
instanttensor==0.1.6
# via -r requirements/rocm-test.in
interegular==0.3.3
# via lm-format-enforcer
isodate==0.7.2
# via azure-storage-blob
isoduration==20.11.0
@@ -399,15 +480,21 @@ isort==8.0.1
jinja2==3.1.6
# via
# datamodel-code-generator
# fastapi
# genai-perf
# lm-eval
# torch
jiter==0.13.0
# via
# anthropic
# openai
jiwer==4.0.0
# via -r requirements/rocm-test.in
jmespath==1.1.0
# via
# boto3
# botocore
# model-hosting-container-standards
joblib==1.5.3
# via
# librosa
@@ -426,6 +513,7 @@ jsonpointer==3.1.0
jsonschema==4.26.0
# via
# hypothesis-jsonschema
# mcp
# mistral-common
# ray
# schemathesis
@@ -443,6 +531,10 @@ kornia==0.8.2
# via torchgeo
kornia-rs==0.1.10
# via kornia
lark==1.2.2
# via
# -c requirements/common.txt
# -r requirements/common.txt
lazy-loader==0.4
# via
# librosa
@@ -466,14 +558,24 @@ lightning-utilities==0.15.3
# lightning
# pytorch-lightning
# torchmetrics
llguidance==1.3.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
llvmlite==0.44.0
# via numba
lm-eval==0.4.11
# via -r requirements/rocm-test.in
lm-format-enforcer==0.11.3
# via
# -c requirements/common.txt
# -r requirements/common.txt
logistro==2.0.1
# via
# choreographer
# kaleido
loguru==0.7.3
# via compressed-tensors
lxml==6.0.2
# via
# blobfile
@@ -500,12 +602,19 @@ mbstrdecoder==1.1.4
# dataproperty
# pytablewriter
# typepy
mcp==1.27.0
# via -r requirements/common.txt
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.10.0
mistral-common==1.11.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# -r requirements/rocm-test.in
model-hosting-container-standards==0.1.14
# via
# -c requirements/common.txt
# -r requirements/common.txt
more-itertools==10.8.0
# via
# inflect
@@ -522,6 +631,8 @@ msgpack==1.1.2
# via
# librosa
# ray
msgspec==0.20.0
# via -r requirements/common.txt
mteb==2.11.5
# via -r requirements/rocm-test.in
multidict==6.7.1
@@ -541,6 +652,8 @@ networkx==3.6.1
# via
# scikit-image
# torch
ninja==1.13.0
# via -r requirements/common.txt
nltk==3.9.3
# via rouge-score
num2words==0.5.14
@@ -555,6 +668,7 @@ numkong==7.1.1
# via albucore
numpy==2.2.6
# via
# -r requirements/common.txt
# -r requirements/rocm-test.in
# accelerate
# albucore
@@ -572,6 +686,7 @@ numpy==2.2.6
# fastparquet
# genai-perf
# geopandas
# gguf
# h5py
# imagehash
# imageio
@@ -620,15 +735,21 @@ numpy==2.2.6
# tritonclient
# vocos
# xarray
# xgrammar
omegaconf==2.3.0
# via
# hydra-core
# lightning
open-clip-torch==2.32.0
# via -r requirements/rocm-test.in
openai==2.30.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
openai-harmony==0.0.8
# via
# -c requirements/common.txt
# -r requirements/common.txt
# gpt-oss
opencensus==0.11.4
# via ray
@@ -637,6 +758,7 @@ opencensus-context==0.1.3
opencv-python-headless==4.13.0.92
# via
# -c requirements/common.txt
# -r requirements/common.txt
# -r requirements/rocm-test.in
# albumentations
# mistral-common
@@ -645,26 +767,59 @@ openpyxl==3.1.5
opentelemetry-api==1.40.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.40.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
opentelemetry-exporter-otlp-proto-common==1.40.0
# via
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.40.0
# via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.40.0
# via opentelemetry-exporter-otlp
opentelemetry-exporter-prometheus==0.61b0
# via ray
opentelemetry-proto==1.40.0
# via ray
# via
# opentelemetry-exporter-otlp-proto-common
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# ray
opentelemetry-sdk==1.40.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# opentelemetry-exporter-prometheus
# opentelemetry-semantic-conventions-ai
# ray
opentelemetry-semantic-conventions==0.61b0
# via opentelemetry-sdk
# via
# opentelemetry-sdk
# opentelemetry-semantic-conventions-ai
opentelemetry-semantic-conventions-ai==0.5.1
# via
# -c requirements/common.txt
# -r requirements/common.txt
optuna==3.6.1
# via genai-perf
orjson==3.11.7
# via
# genai-perf
# kaleido
outlines-core==0.2.11
# via
# -c requirements/common.txt
# -r requirements/common.txt
packaging==26.0
# via
# -c requirements/rocm.txt
@@ -682,6 +837,7 @@ packaging==26.0
# lazy-loader
# lightning
# lightning-utilities
# lm-format-enforcer
# matplotlib
# optuna
# peft
@@ -713,6 +869,8 @@ pandas==3.0.1
# tacoreader
# torchgeo
# xarray
partial-json-parser==0.2.1.1.post7
# via -r requirements/common.txt
pathspec==1.0.4
# via black
pathvalidate==3.3.1
@@ -727,6 +885,7 @@ perf-analyzer==0.1.0
# via genai-perf
pillow==12.1.1
# via
# -r requirements/common.txt
# diffusers
# genai-perf
# imagehash
@@ -768,8 +927,14 @@ pqdm==0.2.0
prometheus-client==0.24.1
# via
# -c requirements/common.txt
# -r requirements/common.txt
# opentelemetry-exporter-prometheus
# prometheus-fastapi-instrumentator
# ray
prometheus-fastapi-instrumentator==7.1.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
propcache==0.4.1
# via
# aiohttp
@@ -779,6 +944,7 @@ proto-plus==1.27.1
protobuf==6.33.6
# via
# -c requirements/common.txt
# -r requirements/common.txt
# google-api-core
# googleapis-common-protos
# grpcio-reflection
@@ -791,11 +957,14 @@ protobuf==6.33.6
# wandb
psutil==7.2.2
# via
# -r requirements/common.txt
# accelerate
# peft
# tensorizer
py==1.11.0
# via pytest-forked
py-cpuinfo==9.0.0
# via -r requirements/common.txt
py-spy==0.4.1
# via ray
pyarrow==23.0.1
@@ -808,6 +977,8 @@ pyasn1==0.6.3
# via pyasn1-modules
pyasn1-modules==0.4.2
# via google-auth
pybase64==1.4.3
# via -r requirements/common.txt
pycocotools==2.0.11
# via terratorch
pycountry==26.2.16
@@ -819,26 +990,44 @@ pycryptodomex==3.23.0
pydantic==2.12.5
# via
# -c requirements/common.txt
# -r requirements/common.txt
# -r requirements/rocm-test.in
# albumentations
# anthropic
# compressed-tensors
# datamodel-code-generator
# fastapi
# fastapi-cloud-cli
# gpt-oss
# lightly
# lm-format-enforcer
# mcp
# mistral-common
# model-hosting-container-standards
# mteb
# openai
# openai-harmony
# pydantic-extra-types
# pydantic-settings
# ray
# wandb
# xgrammar
pydantic-core==2.41.5
# via pydantic
pydantic-extra-types==2.11.1
# via mistral-common
# via
# fastapi
# mistral-common
pydantic-settings==2.13.1
# via
# fastapi
# mcp
pygments==2.19.2
# via rich
pyjwt==2.12.1
# via msal
# via
# mcp
# msal
pyogrio==0.12.1
# via geopandas
pyparsing==3.3.2
@@ -898,6 +1087,16 @@ python-dateutil==2.9.0.post0
# typepy
python-discovery==1.2.0
# via virtualenv
python-dotenv==1.2.2
# via
# pydantic-settings
# uvicorn
python-json-logger==4.1.0
# via -r requirements/common.txt
python-multipart==0.0.22
# via
# fastapi
# mcp
python-rapidjson==1.23
# via tritonclient
pytokens==0.4.1
@@ -914,14 +1113,17 @@ pywavelets==1.9.0
# via imagehash
pyyaml==6.0.3
# via
# -r requirements/common.txt
# accelerate
# albumentations
# datamodel-code-generator
# datasets
# genai-perf
# gguf
# huggingface-hub
# jsonargparse
# lightning
# lm-format-enforcer
# omegaconf
# optuna
# peft
@@ -931,8 +1133,13 @@ pyyaml==6.0.3
# schemathesis
# timm
# transformers
# uvicorn
# vocos
# wandb
pyzmq==27.1.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
rapidfuzz==3.12.1
# via
# -r requirements/rocm-test.in
@@ -952,6 +1159,7 @@ referencing==0.37.0
# jsonschema-specifications
regex==2026.2.28
# via
# -r requirements/common.txt
# diffusers
# nltk
# open-clip-torch
@@ -961,12 +1169,14 @@ regex==2026.2.28
requests==2.32.5
# via
# -c requirements/common.txt
# -r requirements/common.txt
# azure-core
# buildkite-test-collector
# datasets
# diffusers
# docker
# evaluate
# gguf
# google-api-core
# google-cloud-storage
# gpt-oss
@@ -976,6 +1186,7 @@ requests==2.32.5
# mistral-common
# msal
# mteb
# opentelemetry-exporter-otlp-proto-http
# pooch
# ray
# responses
@@ -999,8 +1210,15 @@ rich==14.3.3
# lightning
# mteb
# perceptron
# rich-toolkit
# terratorch
# typer
rich-toolkit==0.19.7
# via
# fastapi-cli
# fastapi-cloud-cli
rignore==0.7.6
# via fastapi-cloud-cli
rioxarray==0.22.0
# via terratorch
rouge-score==0.1.2
@@ -1070,12 +1288,20 @@ sentence-transformers==5.3.0
# via
# -r requirements/rocm-test.in
# mteb
sentencepiece==0.2.1
# via -r requirements/common.txt
sentry-sdk==2.55.0
# via wandb
# via
# fastapi-cloud-cli
# wandb
setproctitle==1.3.7
# via -r requirements/common.txt
setuptools==79.0.1
# via
# -c requirements/common.txt
# -c requirements/rocm.txt
# -r requirements/common.txt
# model-hosting-container-standards
# pytablewriter
# tensorboard
# torch
@@ -1092,6 +1318,7 @@ simplejson==3.20.2
six==1.17.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# junit-xml
# lightly
# opencensus
@@ -1104,8 +1331,9 @@ smmap==5.0.3
# via gitdb
sniffio==1.3.1
# via
# anyio
# anthropic
# httpx
# openai
sortedcontainers==2.4.0
# via hypothesis
soundfile==0.13.1
@@ -1124,10 +1352,16 @@ sqlalchemy==2.0.48
# optuna
sqlitedict==2.1.0
# via lm-eval
sse-starlette==3.3.4
# via mcp
starlette==0.52.1
# via
# fastapi
# mcp
# model-hosting-container-standards
# prometheus-fastapi-instrumentator
# schemathesis
# sse-starlette
# starlette-testclient
starlette-testclient==0.4.1
# via schemathesis
@@ -1137,6 +1371,8 @@ stringzilla==4.6.0
# via albucore
structlog==25.5.0
# via gpt-oss
supervisor==4.3.0
# via model-hosting-container-standards
sympy==1.14.0
# via
# einx
@@ -1180,6 +1416,7 @@ tifffile==2026.3.3
tiktoken==0.12.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# gpt-oss
# lm-eval
# mistral-common
@@ -1194,6 +1431,7 @@ timm==1.0.17
tokenizers==0.22.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# -r requirements/rocm-test.in
# transformers
tomli==2.4.0
@@ -1212,8 +1450,10 @@ torchmetrics==1.9.0
# torchgeo
tqdm==4.67.3
# via
# -r requirements/common.txt
# datasets
# evaluate
# gguf
# huggingface-hub
# lightly
# lightning
@@ -1221,6 +1461,7 @@ tqdm==4.67.3
# mteb
# nltk
# open-clip-torch
# openai
# optuna
# peft
# pqdm
@@ -1233,11 +1474,14 @@ tqdm==4.67.3
transformers==4.57.5
# via
# -c requirements/common.txt
# -r requirements/common.txt
# -r requirements/rocm-test.in
# compressed-tensors
# genai-perf
# peft
# sentence-transformers
# transformers-stream-generator
# xgrammar
transformers-stream-generator==0.0.5
# via -r requirements/rocm-test.in
tritonclient==2.66.0
@@ -1251,6 +1495,8 @@ typepy==1.3.4
# tabledata
typer==0.24.1
# via
# fastapi-cli
# fastapi-cloud-cli
# fastsafetensors
# perceptron
typeshed-client==2.9.0
@@ -1258,9 +1504,12 @@ typeshed-client==2.9.0
typing-extensions==4.15.0
# via
# -c requirements/common.txt
# -r requirements/common.txt
# aiosignal
# albumentations
# alembic
# anthropic
# anyio
# azure-core
# azure-identity
# azure-storage-blob
@@ -1272,9 +1521,13 @@ typing-extensions==4.15.0
# lightning
# lightning-utilities
# lm-eval
# mcp
# mistral-common
# mteb
# openai
# opentelemetry-api
# opentelemetry-exporter-otlp-proto-grpc
# opentelemetry-exporter-otlp-proto-http
# opentelemetry-sdk
# opentelemetry-semantic-conventions
# pqdm
@@ -1283,6 +1536,7 @@ typing-extensions==4.15.0
# pydantic-extra-types
# pytorch-lightning
# referencing
# rich-toolkit
# sentence-transformers
# sqlalchemy
# starlette
@@ -1292,10 +1546,13 @@ typing-extensions==4.15.0
# typeshed-client
# typing-inspection
# wandb
# xgrammar
typing-inspection==0.4.2
# via
# fastapi
# mcp
# pydantic
# pydantic-settings
tzdata==2025.3
# via arrow
uri-template==1.3.0
@@ -1311,7 +1568,14 @@ urllib3==2.6.3
# sentry-sdk
# tritonclient
uvicorn==0.42.0
# via gpt-oss
# via
# fastapi
# fastapi-cli
# fastapi-cloud-cli
# gpt-oss
# mcp
uvloop==0.22.1
# via uvicorn
vector-quantize-pytorch==1.28.0
# via -r requirements/rocm-test.in
virtualenv==21.2.0
@@ -1320,10 +1584,16 @@ vocos==0.1.0
# via -r requirements/rocm-test.in
wandb==0.25.1
# via terratorch
watchfiles==1.1.1
# via
# -r requirements/common.txt
# uvicorn
wcwidth==0.6.0
# via ftfy
webcolors==25.10.0
# via jsonschema
websockets==16.0
# via uvicorn
werkzeug==3.1.6
# via
# schemathesis
@@ -1334,6 +1604,10 @@ wrapt==2.1.2
# via smart-open
xarray==2026.2.0
# via rioxarray
xgrammar==0.1.33
# via
# -c requirements/common.txt
# -r requirements/common.txt
xxhash==3.6.0
# via
# datasets
@@ -1360,14 +1634,14 @@ zstandard==0.25.0
# nvidia-cuda-cupti
# nvidia-cuda-nvrtc
# nvidia-cuda-runtime
# nvidia-cudnn-cu13
# nvidia-cufft
# nvidia-cufile
# nvidia-curand
# nvidia-cusolver
# nvidia-cusparse
# nvidia-nvjitlink
# nvidia-nvtx
# nvidia-cudnn-cu13
# nvidia-cusparselt-cu13
# nvidia-nccl-cu13
# nvidia-nvjitlink
# nvidia-nvshmem-cu13
# nvidia-nvtx

View File

@@ -21,3 +21,5 @@ timm>=1.0.17
# amd-quark: required for Quark quantization on ROCm
# To be consistent with test_quark.py
amd-quark>=0.8.99
# Required for faster safetensors model loading
fastsafetensors >= 0.2.2

View File

@@ -27,12 +27,12 @@ soundfile # required for audio tests
jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.10.0
torchaudio==2.10.0
torchvision==0.25.0
torch==2.11.0
torchaudio==2.11.0
torchvision==0.26.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.1 # required for voxtral test
mistral_common[image,audio] >= 1.11.0 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.13.0 # required for video test

View File

@@ -1,5 +1,5 @@
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -c requirements/common.txt -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
# uv pip compile requirements/test.in -c requirements/common.txt -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu130 --python-platform x86_64-manylinux_2_28 --python-version 3.12
absl-py==2.1.0
# via
# rouge-score
@@ -165,10 +165,12 @@ cryptography==46.0.5
# azure-storage-blob
# msal
# pyjwt
cuda-bindings==12.9.4
cuda-bindings==13.0.3
# via torch
cuda-pathfinder==1.3.3
# via cuda-bindings
cuda-toolkit==13.0.2
# via torch
cupy-cuda12x==13.6.0
# via ray
cycler==0.12.1
@@ -508,7 +510,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.10.0
mistral-common==1.11.0
# via
# -c requirements/common.txt
# -r requirements/test.in
@@ -615,45 +617,45 @@ numpy==2.2.6
# tritonclient
# vocos
# xarray
nvidia-cublas-cu12==12.9.1.4
nvidia-cublas==13.1.0.3
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.9.79
# cuda-toolkit
# nvidia-cudnn-cu13
# nvidia-cusolver
nvidia-cuda-cupti==13.0.85
# via cuda-toolkit
nvidia-cuda-nvrtc==13.0.88
# via cuda-toolkit
nvidia-cuda-runtime==13.0.96
# via cuda-toolkit
nvidia-cudnn-cu13==9.19.0.56
# via torch
nvidia-cuda-nvrtc-cu12==12.9.86
# via torch
nvidia-cuda-runtime-cu12==12.9.79
# via torch
nvidia-cudnn-cu12==9.10.2.21
# via torch
nvidia-cufft-cu12==11.4.1.4
# via torch
nvidia-cufile-cu12==1.14.1.1
# via torch
nvidia-curand-cu12==10.3.10.19
# via torch
nvidia-cusolver-cu12==11.7.5.82
# via torch
nvidia-cusparse-cu12==12.5.10.65
nvidia-cufft==12.0.0.61
# via cuda-toolkit
nvidia-cufile==1.15.1.6
# via cuda-toolkit
nvidia-curand==10.4.0.35
# via cuda-toolkit
nvidia-cusolver==12.0.4.66
# via cuda-toolkit
nvidia-cusparse==12.6.3.3
# via
# nvidia-cusolver-cu12
# torch
nvidia-cusparselt-cu12==0.7.1
# cuda-toolkit
# nvidia-cusolver
nvidia-cusparselt-cu13==0.8.0
# via torch
nvidia-nccl-cu12==2.27.5
nvidia-nccl-cu13==2.28.9
# via torch
nvidia-nvjitlink-cu12==12.9.86
nvidia-nvjitlink==13.0.88
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvshmem-cu12==3.4.5
# via torch
nvidia-nvtx-cu12==12.9.79
# cuda-toolkit
# nvidia-cufft
# nvidia-cusolver
# nvidia-cusparse
nvidia-nvshmem-cu13==3.4.5
# via torch
nvidia-nvtx==13.0.85
# via cuda-toolkit
omegaconf==2.3.0
# via
# hydra-core
@@ -1220,7 +1222,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.10.0+cu129
torch==2.11.0+cu130
# via
# -r requirements/test.in
# accelerate
@@ -1240,13 +1242,12 @@ torch==2.10.0+cu129
# tensorizer
# terratorch
# timm
# torchaudio
# torchgeo
# torchmetrics
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.10.0+cu129
torchaudio==2.11.0+cu130
# via
# -r requirements/test.in
# encodec
@@ -1259,7 +1260,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
torchvision==0.25.0+cu129
torchvision==0.26.0+cu130
# via
# -r requirements/test.in
# lightly

View File

@@ -11,8 +11,8 @@ jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.61.2 # Required for N-gram speculative decoding
--extra-index-url=https://download.pytorch.org/whl/xpu
torch==2.10.0+xpu
torch==2.11.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.4/vllm_xpu_kernels-0.1.4-cp38-abi3-manylinux_2_28_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.5/vllm_xpu_kernels-0.1.5-cp38-abi3-manylinux_2_28_x86_64.whl

View File

@@ -379,6 +379,20 @@ class cmake_build_ext(build_ext):
                dirs_exist_ok=True,
            )
        if _is_cuda():
            # copy vendored deep_gemm package from build_lib to source tree
            # for editable installs
            deep_gemm_build = os.path.join(
                self.build_lib, "vllm", "third_party", "deep_gemm"
            )
            if os.path.exists(deep_gemm_build):
                print(f"Copying {deep_gemm_build} to vllm/third_party/deep_gemm")
                shutil.copytree(
                    deep_gemm_build,
                    "vllm/third_party/deep_gemm",
                    dirs_exist_ok=True,
                )
class precompiled_build_ext(build_ext):
    """Disables extension building when using precompiled binaries."""
@@ -685,6 +699,8 @@ class precompiled_wheel_utils:
        flashmla_regex = re.compile(
            r"vllm/third_party/flashmla/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
        )
        # DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.)
        deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*")
        file_members = list(
            filter(lambda x: x.filename in files_to_copy, wheel.filelist)
        )
@@ -699,6 +715,9 @@ class precompiled_wheel_utils:
        file_members += list(
            filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist)
        )
        file_members += list(
            filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist)
        )
        for file in file_members:
            print(f"[extract] {file.filename}")
@@ -987,6 +1006,12 @@ if _is_cuda():
    ext_modules.append(
        CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
    )
    if envs.VLLM_USE_PRECOMPILED or (
        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
    ):
        # DeepGEMM requires CUDA 12.3+ (SM90/SM100)
        # Optional since it won't build on unsupported architectures
        ext_modules.append(CMakeExtension(name="vllm._deep_gemm_C", optional=True))
if _is_cpu():
    import platform
@@ -1013,6 +1038,11 @@ package_data = {
        "model_executor/layers/quantization/utils/configs/*.json",
        "entrypoints/serve/instrumentator/static/*.js",
        "entrypoints/serve/instrumentator/static/*.css",
        "distributed/kv_transfer/kv_connector/v1/hf3fs/utils/*.cpp",
        # DeepGEMM JIT include headers (vendored via cmake)
        "third_party/deep_gemm/include/**/*.cuh",
        "third_party/deep_gemm/include/**/*.h",
        "third_party/deep_gemm/include/**/*.hpp",
    ]
}
@@ -1060,8 +1090,6 @@ setup(
        ], # Required for audio processing
        "video": [], # Kept for backwards compatibility
        "flashinfer": [], # Kept for backwards compatibility
        # Optional deps for AMD FP4 quantization support
        "petit-kernel": ["petit-kernel"],
        # Optional deps for Helion kernel development
        # NOTE: When updating helion version, also update CI files:
        # - .buildkite/test_areas/kernels.yaml
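
The wheel-extraction change in the setup.py diff above pairs a catch-all pattern for the vendored DeepGEMM tree with the stricter FlashMLA pattern that keeps only non-hidden `.py` files. A standalone check of the two regexes as they appear in the diff (the sample wheel member paths are invented for illustration):

```python
import re

# Regexes copied from the precompiled_wheel_utils diff above.
# FlashMLA: only non-hidden .py files; DeepGEMM: every vendored file.
flashmla_regex = re.compile(r"vllm/third_party/flashmla/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*")

# Hypothetical wheel member names.
members = [
    "vllm/third_party/flashmla/interface.py",                     # kept: plain .py
    "vllm/third_party/flashmla/.hidden/skip.py",                  # dropped: hidden dir
    "vllm/third_party/deep_gemm/include/deep_gemm/fp8_gemm.cuh",  # kept: catch-all
]
kept = [m for m in members if flashmla_regex.match(m) or deep_gemm_regex.match(m)]
print(kept)
```

The `(?:[^/.][^/]*/)*` segment refuses any path component starting with `.`, which is why the hidden directory is filtered out, while `deep_gemm/.*` deliberately matches headers and shared objects as well as Python files.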

View File

@@ -39,7 +39,9 @@ from vllm.utils.torch_utils import set_random_seed
class TestAllReduceRMSNormModel(torch.nn.Module):
    def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
    def __init__(
        self, hidden_size=16, token_num=16, eps=1e-6, dtype: torch.dtype = torch.float16
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
@@ -78,7 +80,9 @@ class TestAllReduceRMSNormModel(torch.nn.Module):
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
    quant_key = kFp8StaticTensorSym
    def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
    def __init__(
        self, hidden_size=16, token_num=16, eps=1e-6, dtype: torch.dtype = torch.float16
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
@@ -88,6 +92,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
                weight_shape=(hidden_size, hidden_size),
                activation_quant_key=self.quant_key,
                weight_quant_key=self.quant_key,
                input_dtype=dtype,
            )
            for i in range(3)
        ]
@@ -127,7 +132,9 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
    def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
    def __init__(
        self, hidden_size=16, token_num=16, eps=1e-6, dtype: torch.dtype = torch.float16
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
@@ -314,7 +321,7 @@ def all_reduce_fusion_pass_on_test_model(
    )
    token_num = batch_size * seq_len
    model = test_model_cls(hidden_size, token_num)
    model = test_model_cls(hidden_size, token_num, dtype=dtype)
    hidden_states = torch.randn((token_num, hidden_size), requires_grad=False)

View File

@@ -109,6 +109,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
weight_shape=(hidden_size, hidden_size),
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
+input_dtype=self.vllm_config.model_config.dtype,
)
for i in range(3)
]

View File

@@ -23,6 +23,7 @@ from vllm.config import (
ModelConfig,
PassConfig,
VllmConfig,
+get_current_vllm_config,
set_current_vllm_config,
)
from vllm.model_executor.layers.activation import SiluAndMul
@@ -49,6 +50,7 @@ class TestSiluMul(torch.nn.Module):
weight_shape=(hidden_size, hidden_size),
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
+input_dtype=get_current_vllm_config().model_config.dtype,
)
def forward(self, x):
@@ -92,6 +94,7 @@ class TestFusedAddRMSNorm(torch.nn.Module):
weight_shape=(hidden_size, intermediate_size),
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
+input_dtype=get_current_vllm_config().model_config.dtype,
)
def forward(self, hidden_states, residual):

View File

@@ -9,7 +9,7 @@ import vllm.config
import vllm.ir.ops
import vllm.plugins
from tests.compile.backend import TestBackend
-from tests.utils import TestBlockFP8Layer, TestFP8Layer
+from tests.utils import TestFP8Layer
from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops
from vllm.compilation.passes.fusion.matcher_utils import QUANT_OPS
from vllm.compilation.passes.fusion.rms_quant_fusion import (
@@ -28,19 +28,23 @@ from vllm.config import (
VllmConfig,
)
from vllm.model_executor.kernels.linear import (
+AiterFp8BlockScaledMMKernel,
ChannelWiseTorchFP8ScaledMMLinearKernel,
+CutlassFp8BlockScaledMMKernel,
CutlassFP8ScaledMMLinearKernel,
+DeepGemmFp8BlockScaledMMKernel,
+FlashInferFp8DeepGEMMDynamicBlockScaledKernel,
FlashInferFP8ScaledMMLinearKernel,
FP8ScaledMMLinearKernel,
PerTensorTorchFP8ScaledMMLinearKernel,
ROCmFP8ScaledMMLinearKernel,
RowWiseTorchFP8ScaledMMLinearKernel,
+TritonFp8BlockScaledMMKernel,
+_KernelT,
)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
QuantKey,
ScaleDesc,
+create_fp8_quant_key,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
cutlass_block_fp8_supported,
@@ -66,9 +70,12 @@ CUDA_KERNEL_GROUPSHAPE_COMBINATIONS = [
(PerTensorTorchFP8ScaledMMLinearKernel, GroupShape.PER_TENSOR),
# ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token
(ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
-# Blockwise group shapes (no kernel abstraction)
-(None, GroupShape(1, 128)),
-(None, GroupShape(1, 64)),
+# Blockwise group shapes
+(FlashInferFp8DeepGEMMDynamicBlockScaledKernel, GroupShape(1, 128)),
+(CutlassFp8BlockScaledMMKernel, GroupShape(1, 128)),
+(DeepGemmFp8BlockScaledMMKernel, GroupShape(1, 128)),
+(TritonFp8BlockScaledMMKernel, GroupShape(1, 128)),
+(TritonFp8BlockScaledMMKernel, GroupShape(1, 64)),
]
# ROCm kernels
@@ -80,8 +87,8 @@ ROCM_KERNEL_GROUPSHAPE_COMBINATIONS = [
# ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token
(ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
# Blockwise group shapes (no kernel abstraction)
-(None, GroupShape(1, 128)),
-(None, GroupShape(1, 64)),
+(TritonFp8BlockScaledMMKernel, GroupShape(1, 128)),
+(TritonFp8BlockScaledMMKernel, GroupShape(1, 64)),
]
KERNEL_GROUPSHAPE_COMBINATIONS = (
@@ -100,8 +107,8 @@ AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [
# Per-token with ChannelWiseTorchFP8ScaledMMLinearKernel
(ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, True),
(ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, False),
-# Blockwise (no kernel abstraction)
-(None, GroupShape(1, 128), True),
+# Blockwise
+(AiterFp8BlockScaledMMKernel, GroupShape(1, 128), True),
]
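The group shapes in these combination lists determine how many quantization scales an activation carries: `GroupShape(1, 128)` means one scale per 1x128 block, `PER_TOKEN` one scale per row, `PER_TENSOR` a single scale. As a rough illustration (a pure-Python toy, not vLLM's `GroupShape` class):

```python
import math

def num_quant_scales(m, n, group_rows, group_cols):
    # One scale per (group_rows x group_cols) block of an (m, n) tensor.
    # (1, 128)  -> blockwise; (1, n) -> per-token; (m, n) -> per-tensor.
    return math.ceil(m / group_rows) * math.ceil(n / group_cols)

# 4 tokens, hidden size 256
assert num_quant_scales(4, 256, 1, 128) == 8   # blockwise GroupShape(1, 128)
assert num_quant_scales(4, 256, 1, 256) == 4   # per-token
assert num_quant_scales(4, 256, 4, 256) == 1   # per-tensor
```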
@@ -110,8 +117,9 @@ class TestModel(torch.nn.Module):
self,
hidden_size: int,
eps: float,
-force_kernel: FP8ScaledMMLinearKernel | None,
+force_kernel: type[_KernelT] | None,
group_shape: GroupShape,
+dtype: torch.dtype,
use_aiter_fusion: bool = False,
use_aiter_quant: bool = False,
*args,
@@ -129,43 +137,31 @@ class TestModel(torch.nn.Module):
is_blockwise = group_shape.is_per_group()
if is_blockwise:
-act_quant_scale_desc = ScaleDesc(torch.float32, False, group_shape)
-self.activation_quant_key = QuantKey(
-    dtype=FP8_DTYPE, scale=act_quant_scale_desc, symmetric=True
+block_size = group_shape.col
+self.activation_quant_key = create_fp8_quant_key(
+    static=False, group_shape=group_shape
)
-self.fp8_linear_layers = [
-    TestBlockFP8Layer(
-        weight_shape=(hidden_size, hidden_size),
-        group_shape=group_shape,
-        cutlass_block_fp8_supported=cutlass_block_fp8_supported(),
-        use_aiter_and_is_supported=use_aiter_quant,
-        transpose_weights=use_aiter_fusion,
-    )
-    for _ in range(3)
-]
-self.enable_quant_fp8_custom_op = (
-    False
-    if use_aiter_quant
-    else self.fp8_linear_layers[0].linear_op.input_quant_op.enabled()
+self.weight_quant_key = create_fp8_quant_key(
+    static=True, group_shape=GroupShape(block_size, block_size)
)
else:
is_static = group_shape == GroupShape.PER_TENSOR
-act_quant_scale_desc = ScaleDesc(torch.float32, is_static, group_shape)
-w_quant_scale_desc = ScaleDesc(torch.float32, True, group_shape)
-self.activation_quant_key = QuantKey(
-    dtype=FP8_DTYPE, scale=act_quant_scale_desc, symmetric=True
+self.activation_quant_key = create_fp8_quant_key(
+    is_static, group_shape=group_shape
)
-self.weight_quant_key = QuantKey(
-    dtype=FP8_DTYPE, scale=w_quant_scale_desc, symmetric=True
+self.weight_quant_key = create_fp8_quant_key(
+    static=True, group_shape=group_shape
)
self.fp8_linear_layers = [
TestFP8Layer(
weight_shape=(hidden_size, hidden_size),
activation_quant_key=self.activation_quant_key,
weight_quant_key=self.weight_quant_key,
force_kernel=force_kernel,
transpose_weights=use_aiter_fusion,
+input_dtype=dtype,
)
for _ in range(3)
]
@@ -354,6 +350,7 @@ def test_fusion_rmsnorm_quant(
eps=eps,
force_kernel=force_kernel,
group_shape=group_shape,
+dtype=dtype,
use_aiter_fusion=False,
use_aiter_quant=False,
)
@@ -426,6 +423,7 @@ def test_aiter_fusion_rmsnorm_quant(
eps=eps,
force_kernel=force_kernel,
group_shape=group_shape,
+dtype=dtype,
use_aiter_fusion=True, # Always use aiter fusion ops in aiter test
use_aiter_quant=use_aiter_quant_op, # Toggle aiter quantization
)
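The `static` flag on `create_fp8_quant_key` distinguishes precomputed scales from scales derived per batch. The difference can be sketched with a scalar toy (not vLLM's FP8 kernels; 448.0 is just used as a clamp bound, matching the e4m3 max):

```python
def quantize_symmetric(vals, scale=None, qmax=448.0):
    # Static quantization passes a precomputed scale; dynamic derives
    # the scale from the current batch's max magnitude.
    if scale is None:  # dynamic activation quantization
        scale = max(abs(v) for v in vals) / qmax
    q = [max(-qmax, min(qmax, v / scale)) for v in vals]
    return q, scale

q_dyn, s_dyn = quantize_symmetric([1.0, -2.0, 4.0])            # dynamic
q_sta, s_sta = quantize_symmetric([1.0, -2.0, 4.0], scale=0.5)  # static
```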

View File

@@ -66,6 +66,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
self.kv_cache_dtype = kv_cache_dtype
self.device = device
self.vllm_config = vllm_config
+self.dtype = vllm_config.model_config.dtype
self.attn = Attention(
num_heads=self.num_qo_heads,
@@ -155,6 +156,7 @@ class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
device=self.device,
+input_dtype=self.dtype,
)
w = kwargs.get("w")

View File

@@ -74,6 +74,7 @@ class MLAAttentionQuantPatternModel(torch.nn.Module):
self.kv_cache_dtype = kv_cache_dtype
self.device = device
self.vllm_config = vllm_config
+self.dtype = vllm_config.model_config.dtype
# Create kv_b_proj (ColumnParallelLinear) on device.
# Reuse weights from prior model instance when available, because
@@ -190,6 +191,7 @@ class TestMLAAttentionFp8StaticQuantPatternModel(MLAAttentionQuantPatternModel):
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
device=self.device,
+input_dtype=self.dtype,
)
w = kwargs.get("w")

View File

@@ -36,9 +36,9 @@ from vllm.model_executor.kernels.linear import (
)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
+create_fp8_quant_key,
kFp8Dynamic128Sym,
kFp8StaticTensorSym,
kNvfp4Dynamic,
@@ -58,7 +58,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
quant_key = kFp8StaticTensorSym
def __init__(
-self, hidden_size: int, force_kernel: FP8ScaledMMLinearKernel, **kwargs
+self,
+hidden_size: int,
+force_kernel: FP8ScaledMMLinearKernel,
+dtype: torch.dtype,
+**kwargs,
):
super().__init__()
self.silu_and_mul = SiluAndMul()
@@ -68,6 +72,7 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
activation_quant_key=self.quant_key,
weight_quant_key=self.quant_key,
force_kernel=force_kernel,
+input_dtype=dtype,
)
self.enable_silu_mul_custom_op = self.silu_and_mul.enabled()
@@ -137,14 +142,20 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
class TestSiluMulGroupFp8QuantModel(torch.nn.Module):
-def __init__(self, hidden_size: int, **kwargs):
+act_quant_key = kFp8Dynamic128Sym
+def __init__(self, hidden_size: int, dtype: torch.dtype, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul()
-self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
-    weight_group_shape=GroupShape(128, 128),
-    act_quant_group_shape=GroupShape(1, 128),
-    cutlass_block_fp8_supported=False,
-    use_aiter_and_is_supported=True,
+self.weight_quant_key = create_fp8_quant_key(
+    static=True, group_shape=GroupShape(hidden_size, hidden_size)
)
+self.w8a8_block_fp8_linear = TestFP8Layer(
+    weight_shape=(hidden_size, hidden_size),
+    weight_quant_key=self.weight_quant_key,
+    activation_quant_key=self.act_quant_key,
+    input_dtype=dtype,
)
self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
@@ -157,7 +168,7 @@ class TestSiluMulGroupFp8QuantModel(torch.nn.Module):
def forward(self, x):
y = self.silu_and_mul(x)
-x2 = self.w8a8_block_fp8_linear.apply(y, self.w, self.wscale)
+x2 = self.w8a8_block_fp8_linear(y, self.w, self.wscale)
return x2
def ops_in_model_before(self):
@@ -324,7 +335,9 @@ def test_fusion_silu_and_mul_quant(
passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]
backend = TestBackend(*passes)
-model = model_class(hidden_size=hidden_size, force_kernel=force_kernel, x=x)
+model = model_class(
+    hidden_size=hidden_size, force_kernel=force_kernel, x=x, dtype=dtype
+)
# First dimension dynamic
torch._dynamo.mark_dynamic(x, 0)
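`SiluAndMul`, the activation these fusion tests target, gates the second half of the input with SiLU of the first half. A scalar sketch of the math (not the fused kernel):

```python
import math

def silu_and_mul(x):
    # silu(x[:d]) * x[d:] elementwise, where d is half the length.
    d = len(x) // 2
    silu = lambda v: v / (1.0 + math.exp(-v))
    return [silu(a) * b for a, b in zip(x[:d], x[d:])]

out = silu_and_mul([1.0, 2.0, 3.0, 4.0])
# out[0] = silu(1) * 3, out[1] = silu(2) * 4
```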

View File

@@ -216,12 +216,14 @@ def test_splitting_ops_dynamic():
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True,
-splitting_ops=["vllm::unified_attention"],
+splitting_ops=["vllm::unified_attention_with_output"],
)
)
# with inductor partition we use splitting_ops directly for
# partition rules
-assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]
+assert config.compilation_config.splitting_ops == [
+    "vllm::unified_attention_with_output"
+]
# When attn_fusion pass enabled.
config = VllmConfig(
@@ -281,7 +283,7 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
mode=CompilationMode.VLLM_COMPILE,
use_inductor_graph_partition=True,
splitting_ops=[
-"vllm::unified_attention",
+"vllm::unified_attention_with_output",
"vllm::moe_forward",
"vllm::moe_forward_shared",
],
@@ -289,7 +291,7 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
)
splitting_ops = config.compilation_config.splitting_ops
assert splitting_ops == [
-"vllm::unified_attention",
+"vllm::unified_attention_with_output",
"vllm::moe_forward",
"vllm::moe_forward_shared",
]
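`splitting_ops` name the ops at which the compiled graph is cut into partitions. The effect can be modeled on a flat op list (a toy, not Inductor's actual partitioner):

```python
def partition_at(ops, splitting_ops):
    # Each splitting op becomes its own partition; runs of other ops
    # between them stay grouped (and would be compiled together).
    parts, run = [], []
    for op in ops:
        if op in splitting_ops:
            if run:
                parts.append(run)
            parts.append([op])
            run = []
        else:
            run.append(op)
    if run:
        parts.append(run)
    return parts

parts = partition_at(
    ["rms_norm", "vllm::unified_attention_with_output",
     "silu_and_mul", "vllm::moe_forward", "linear"],
    {"vllm::unified_attention_with_output", "vllm::moe_forward"},
)
```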

View File

@@ -246,8 +246,9 @@ def default_vllm_config():
"""
from vllm.config import VllmConfig, set_current_vllm_config
-with set_current_vllm_config(VllmConfig()):
-    yield
+config = VllmConfig()
+with set_current_vllm_config(config):
+    yield config
@pytest.fixture()

View File

@@ -1,119 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-node integration test for MessageQueue TCP fallback.
Verifies that when writer and readers span separate nodes (Docker containers
with isolated /dev/shm), `create_from_process_group` correctly detects
cross-node ranks via `in_the_same_node_as()` and falls back to ZMQ TCP
transport — and that data actually arrives.
"""
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.parallel_state import in_the_same_node_as
def main():
dist.init_process_group(backend="gloo")
rank = dist.get_rank()
world_size = dist.get_world_size()
assert world_size >= 2, (
f"Need at least 2 ranks across nodes, got world_size={world_size}"
)
# Verify that in_the_same_node_as detects cross-node correctly
status = in_the_same_node_as(dist.group.WORLD, source_rank=0)
local_count = sum(status)
print(
f"[Rank {rank}] in_the_same_node_as(source=0): {status} "
f"(local={local_count}/{world_size})"
)
# With 2 Docker containers (1 proc each), rank 0 and rank 1
# should be on different nodes.
assert local_count < world_size, (
f"Expected cross-node ranks but all {world_size} ranks appear local."
)
# Create MessageQueue
writer_rank = 0
mq = MessageQueue.create_from_process_group(
dist.group.WORLD,
max_chunk_bytes=1024 * 1024, # 1 MiB
max_chunks=10,
writer_rank=writer_rank,
)
# Verify the transport path selection
if rank == writer_rank:
print(
f"[Rank {rank}] Writer: n_local_reader={mq.n_local_reader}, "
f"n_remote_reader={mq.n_remote_reader}"
)
assert mq.n_remote_reader > 0, (
"Writer should have at least 1 remote (TCP) reader in a multi-node setup."
)
else:
if status[rank]:
assert mq._is_local_reader, (
f"Rank {rank} is on the same node as writer but is not a local reader."
)
print(f"[Rank {rank}] Reader: local (shared memory)")
else:
assert mq._is_remote_reader, (
f"Rank {rank} is on a different node but is not a remote (TCP) reader."
)
print(f"[Rank {rank}] Reader: remote (TCP)")
# Test data transfer: simple objects
dist.barrier()
if rank == writer_rank:
mq.enqueue("hello_from_node0")
else:
msg = mq.dequeue(timeout=10)
assert msg == "hello_from_node0"
dist.barrier()
print(f"[Rank {rank}] Simple object test passed")
# Test data transfer: numpy arrays
np.random.seed(42)
arrays = [
np.random.randint(0, 100, size=np.random.randint(100, 5000)) for _ in range(100)
]
dist.barrier()
if rank == writer_rank:
for arr in arrays:
mq.enqueue(arr)
else:
for i, expected in enumerate(arrays):
received = mq.dequeue(timeout=10)
assert np.array_equal(expected, received), (
f"Array mismatch at index {i}: "
f"expected shape {expected.shape}, got shape {received.shape}"
)
dist.barrier()
print(f"[Rank {rank}] Numpy array test passed")
# Test data transfer: large payload (> max_chunk_bytes)
dist.barrier()
big_array = np.zeros(200_000, dtype=np.int64) # ~1.6 MiB > 1 MiB chunk
if rank == writer_rank:
mq.enqueue(big_array)
else:
received = mq.dequeue(timeout=10)
assert np.array_equal(big_array, received)
dist.barrier()
print(f"[Rank {rank}] Large payload test passed")
# Done -- cleanup
dist.barrier()
print(f"[Rank {rank}] All MessageQueue TCP multi-node tests passed!")
dist.destroy_process_group()
if __name__ == "__main__":
main()
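The deleted test's large-payload case relies on the queue splitting serialized objects across fixed-size chunks when they exceed `max_chunk_bytes`. That splitting can be sketched as follows (not `MessageQueue`'s actual wire format):

```python
import pickle

def to_chunks(obj, max_chunk_bytes):
    # Serialize the object, then split the bytes into fixed-size pieces.
    buf = pickle.dumps(obj)
    return [buf[i:i + max_chunk_bytes] for i in range(0, len(buf), max_chunk_bytes)]

def from_chunks(chunks):
    # Reassemble and deserialize on the reader side.
    return pickle.loads(b"".join(chunks))

payload = list(range(200_000))          # serializes well beyond one 64 KiB chunk
chunks = to_chunks(payload, 64 * 1024)
assert len(chunks) > 1                  # exercised the multi-chunk path
assert from_chunks(chunks) == payload
```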

View File

@@ -525,6 +525,29 @@ def test_human_readable_model_len():
parser.parse_args(["--max-model-len", invalid])
+def test_numa_bind_args():
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    args = parser.parse_args(
+        [
+            "--numa-bind",
+            "--numa-bind-nodes",
+            "0",
+            "0",
+            "1",
+            "1",
+            "--numa-bind-cpus",
+            "0-3",
+            "4-7",
+            "8-11",
+            "12-15",
+        ]
+    )
+    engine_args = EngineArgs.from_cli_args(args=args)
+    assert engine_args.numa_bind is True
+    assert engine_args.numa_bind_nodes == [0, 0, 1, 1]
+    assert engine_args.numa_bind_cpus == ["0-3", "4-7", "8-11", "12-15"]
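`--numa-bind-cpus` takes one range string per worker; expanding such a range is straightforward (a hypothetical helper for illustration, not vLLM's actual parser):

```python
def expand_cpu_range(spec):
    # "4-7" -> [4, 5, 6, 7]; a bare "5" -> [5]
    if "-" in spec:
        lo, hi = spec.split("-", 1)
        return list(range(int(lo), int(hi) + 1))
    return [int(spec)]

cpus = [expand_cpu_range(s) for s in ["0-3", "4-7", "8-11", "12-15"]]
```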
def test_ir_op_priority():
from vllm.config.kernel import IrOpPriorityConfig, KernelConfig

View File

@@ -87,7 +87,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
-io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
@@ -123,7 +122,6 @@ async def test_chat_error_non_stream():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
@@ -173,7 +171,6 @@ async def test_chat_error_stream():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)

View File

@@ -567,7 +567,6 @@ def _build_serving_render(
return OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
-io_processor=engine.io_processor,
model_registry=model_registry,
request_logger=None,
chat_template=CHAT_TEMPLATE,
@@ -599,7 +598,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
class MockEngine:
model_config: MockModelConfig = field(default_factory=MockModelConfig)
input_processor: MagicMock = field(default_factory=MagicMock)
-io_processor: MagicMock = field(default_factory=MagicMock)
renderer: MagicMock = field(default_factory=MagicMock)
@@ -632,7 +630,6 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
@@ -662,7 +659,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
@@ -693,7 +689,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
# Initialize the serving chat
@@ -737,7 +732,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
@@ -779,7 +773,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
# Initialize the serving chat
@@ -823,7 +816,6 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig(skip_tokenizer_init=True)
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_tokenizer = MagicMock(spec=MistralTokenizer)
mock_renderer = MistralRenderer(
@@ -863,7 +855,6 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig(skip_tokenizer_init=True)
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_tokenizer = MagicMock(spec=MistralTokenizer)
mock_renderer = MistralRenderer(
@@ -906,7 +897,6 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
# Initialize the serving chat
@@ -952,7 +942,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_chat = _build_serving_chat(mock_engine)
@@ -1003,7 +992,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
# Mock the generate method to return an async generator
@@ -1095,7 +1083,6 @@ class TestServingChatWithHarmony:
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
return mock_engine
@@ -1732,7 +1719,6 @@ async def test_tool_choice_validation_without_parser():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
models = OpenAIServingModels(
@@ -1802,7 +1788,6 @@ async def test_streaming_n_gt1_independent_tool_parsers():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
models = OpenAIServingModels(

View File

@@ -79,7 +79,6 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
serving_render = OpenAIServingRender(
model_config=engine.model_config,
renderer=engine.renderer,
-io_processor=engine.io_processor,
model_registry=models.registry,
request_logger=None,
chat_template=None,
@@ -107,7 +106,6 @@ async def test_completion_error_non_stream():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_completion = _build_serving_completion(mock_engine)
@@ -157,7 +155,6 @@ async def test_completion_error_stream():
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
-mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
serving_completion = _build_serving_completion(mock_engine)
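These tests build engines whose collaborators are all `MagicMock`s; with `io_processor` removed, the pattern reduces to something like this condensed sketch of the tests' `MockEngine`:

```python
from dataclasses import dataclass, field
from unittest.mock import MagicMock

@dataclass
class MockEngine:
    # Every collaborator defaults to a fresh MagicMock per instance,
    # so tests can stub attributes and assert on recorded calls.
    model_config: MagicMock = field(default_factory=MagicMock)
    input_processor: MagicMock = field(default_factory=MagicMock)
    renderer: MagicMock = field(default_factory=MagicMock)

engine = MockEngine()
engine.renderer.render_prompt("hello")      # hypothetical call, recorded by the mock
assert engine.renderer.render_prompt.called
assert MockEngine().renderer is not engine.renderer  # default_factory: fresh mock each time
```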

Some files were not shown because too many files have changed in this diff.