Compare commits
v0.15.2rc0...v0.16.0rc0
201 commits
SHA1 of each commit in the range:

bbe0574d8e 4d9513537d 439afa4eea fa4e0fb028 ce498a6d61 9f14c9224d 535de06cb1 4292c90a2a 6e98f6d8b6 2f6d17cb2f
192ad4648b 0e92298622 87d9a26166 80f921ba4b 711edaf0d0 1d367a738e 32a02c7ca2 f67ee8b859 e57ef99b40 f8516a1ab9
824058076c 8e32690869 a208439537 bcd2f74c0d f79f777803 4c8d1bf361 061da6bcf7 4403e3ed4c 08e094997e d88a1df699
90d74ebaa4 45f8fd6f97 5e1e0a0fbd eb5ed20743 2647163674 9fb27dd3b3 4dffc5e044 e1bf04b6c2 02080179a3 1b8fe6f7c4
52ee21021a 655efb3e69 bd8da29a66 2a99c5a6c8 3f7662d650 a372f3f40a 61e632aea1 b1bb18de8d 2267cb1cfd 0d6ccf68fa
18e7cbbb15 f0d5251715 5c4f2dd6ef f3d8a34671 4bc913aeec fbb3cf6981 2df2b3499d 2a8d84e66d a3acfa1071 be8168ff88
f6af34626d ceab70c89d 52683ccbe1 e346e2d056 83449a5ff0 dad2d6a590 32e84fa1ff fd9c83d0e0 b95cc5014d 61397891ce
ef248ff740 e10604480b bf001da4bf a0a984ac2e f1cb9b5544 4c4b6f7a97 10546f925a e69c990c21 5eac9a1b34 1b60b45d0d
4b3803d180 5019c59dd2 089cd4f002 0130223bd9 5d1aef3004 ffe1fc7a28 8b7346d5f1 6141ebe0dd 199e3cb476 9f8cb81b44
d7e17aaacd 528e9b1490 d95b4be47a 4061dcf4c5 0aca8b8c62 9eb58f8cf1 b10d05b8a8 b398e5c819 78061ef584 528b3076af
a502831d36 ba871fb788 ab374786c7 808dd87b30 beb8899482 ce88756b96 a3154a6092 7c036432fc 318b120766 c3b40dc3e7
a01ef3fa51 7320ca3942 cf0a99f84d e535d90deb 0b225fb7b2 46b4a02794 8869cd8ec1 cd86fff38f b5f8c3092d 21997f45b1
672023877b 754a8ca942 302ecf64ff b6bb2842cf 79b6ec6aab d6416fdde9 0fb3157267 a358e4dffe 079781177a 63c0889416
1e86c802d4 fedf64332e 2238a12c13 ce0afe2451 88c3e114d8 92924b2ddd 27cb2f678f 22d9a056d5 13b842f271 15f40b20aa
793af538a3 6f5e7cda57 68feb76a6f 4cb59dea6a 608b556507 f0a1c8453a 8980001c93 527bcd14d4 f68e3ea4e1 d5c41db35b
1618e25492 f3888aca83 f0bca83ee4 73419abfae e77f162cf5 8ecd213c0b 5b55c0bea7 15e0bb9c42 6c64c41b4a a2ef06e1b3
0a3c71e7e5 29fba76781 9df152bbf6 876a16f4fb aaa901ad55 010ec0c30e 64a40a7ab4 31aedfe7d6 67ebaff528 2b465570e6
9ca66ecc10 c3a9752b0c f451b4558b 3f96fcf646 6c1f9e4c18 67239c4c42 8ece60768f fd0e377244 f857a03f6b 74898a7015
8f5d51203b ae5b7aff2b a11bc12d53 58cb55e4de cf896ae0e3 c5113f60f2 174f16700b 8e2ad97ad0 10152d2194 1a7894dbdf
c87eac18f7 f45870b53f ba45bedfd1 9432ed8c7e 726d89720c d334dd26c4 070c811d6f 8bfc8d5600 ec51831a22 80b918f2bd
c46b0cd0af
@@ -0,0 +1,15 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.695
  - name: "exact_match,flexible-extract"
    value: 0.447
limit: 1319
num_fewshot: 5
max_model_len: 262144
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true

@@ -0,0 +1,19 @@
model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.7142
  - name: "exact_match,flexible-extract"
    value: 0.4579
env_vars:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
limit: 1319
num_fewshot: 5
max_model_len: 262144
kv_cache_dtype: fp8
enforce_eager: false
apply_chat_template: true
fewshot_as_multiturn: true
trust_remote_code: true
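For orientation, here is a minimal sketch (not part of the diff) of how a config file like the two above could be turned into an lm-eval invocation. Only the YAML keys come from the configs; the helper itself, the file path, and the exact flag mapping are illustrative assumptions.

```python
import os
import yaml  # pip install pyyaml

def build_lm_eval_command(config_path: str) -> list[str]:
    """Sketch: turn one of the accuracy configs above into an lm_eval command."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    # Env vars such as VLLM_USE_FLASHINFER_MOE_FP8 are exported before launching.
    for key, value in cfg.get("env_vars", {}).items():
        os.environ[key] = str(value)

    model_args = f"pretrained={cfg['model_name']},max_model_len={cfg['max_model_len']}"
    if "kv_cache_dtype" in cfg:
        model_args += f",kv_cache_dtype={cfg['kv_cache_dtype']}"

    cmd = [
        "lm_eval", "--model", "vllm",
        "--model_args", model_args,
        "--tasks", ",".join(t["name"] for t in cfg["tasks"]),
        "--num_fewshot", str(cfg["num_fewshot"]),
        "--limit", str(cfg["limit"]),
    ]
    if cfg.get("apply_chat_template"):
        cmd.append("--apply_chat_template")
    if cfg.get("fewshot_as_multiturn"):
        cmd.append("--fewshot_as_multiturn")
    return cmd

print(build_lm_eval_command("NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml"))
```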
@@ -1 +1,2 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml

@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
@@ -393,6 +393,11 @@ run_serving_tests() {
    fi
  fi

  # save the compilation mode and optimization level on the serving results
  # whenever they are set
  compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
  optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')

  # iterate over different QPS
  for qps in $qps_list; do
    # remove the surrounding single quote from qps

@@ -406,15 +411,15 @@ run_serving_tests() {
    for max_concurrency in $max_concurrency_list; do
      new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
      echo " new test name $new_test_name"
      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
      # pass the tensor parallel size, the compilation mode, and the optimization
      # level to the client so that they can be used on the benchmark dashboard
      client_command="vllm bench serve \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --max-concurrency $max_concurrency \
        --metadata "tensor_parallel_size=$tp" \
        --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
        $client_args $client_remote_args "

      echo "Running test case $test_name with qps $qps"
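As a side note, the two jq expressions above fall back to an empty string when the key is missing from the server params. A small sketch of the same lookup in Python (the example values are made up, not taken from any real test config):

```python
import json

def optional_field(server_params: dict, key: str) -> str:
    """Equivalent of jq's '."<key>" // empty': the value if present, else ""."""
    value = server_params.get(key)
    return "" if value is None else str(value)

server_params = json.loads('{"compilation_config.mode": 3}')
print(optional_field(server_params, "compilation_config.mode"))  # "3"
print(optional_field(server_params, "optimization_level"))       # ""
```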
@@ -86,5 +86,27 @@ docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130

# CPU images (vllm/vllm-openai-cpu)
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}

docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64
docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai-cpu:latest-x86_64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64

docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker push vllm/vllm-openai-cpu:latest-arm64
docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64

docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
docker manifest push vllm/vllm-openai-cpu:latest
docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
\`\`\`
EOF
EOF
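A quick way to sanity-check the resulting multi-arch CPU manifests, once the pushes above have run, is to inspect the pushed tag and list the architectures it covers. This is not part of the release script; it is a sketch that assumes the Docker CLI is installed and the tag is a manifest list.

```python
import json
import subprocess

def manifest_architectures(image: str) -> list[str]:
    """List the architectures behind a multi-arch tag via `docker manifest inspect`."""
    out = subprocess.run(
        ["docker", "manifest", "inspect", image],
        check=True, capture_output=True, text=True,
    ).stdout
    manifest = json.loads(out)
    return [m["platform"]["architecture"] for m in manifest.get("manifests", [])]

# Expected to report both amd64 and arm64 entries after the pushes above.
print(manifest_architectures("vllm/vllm-openai-cpu:latest"))
```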
@@ -87,7 +87,7 @@ mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"
echo "Raw commands: $commands"

commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}

@@ -169,6 +169,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
    --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

commands=$(echo "$commands" | sed 's/ \\ / /g')
echo "Final commands: $commands"

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py

@@ -176,7 +179,6 @@ fi
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13

PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# Test that we're launching on the machine that has

@@ -187,56 +189,7 @@ if [[ -z "$render_gid" ]]; then
  exit 1
fi

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --group-add "$render_gid" \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  at_least_one_shard_with_tests=0
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]] && [[ ${st} -ne 5 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    elif [[ ${st} -eq 5 ]]; then
      echo "Shard exited with status 5 (no tests collected) - treating as success"
    else # This means st is 0
      at_least_one_shard_with_tests=1
    fi
  done
  if [[ ${#STATUS[@]} -gt 0 && ${at_least_one_shard_with_tests} -eq 0 ]]; then
    echo "All shards reported no tests collected. Failing the build."
    exit 1
  fi

elif [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then

export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
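The shard bookkeeping above leans on pytest's convention that exit code 5 means "no tests were collected". A compact sketch of the same aggregation logic, for reference (the status lists are illustrative):

```python
def aggregate_shard_statuses(statuses: list[int]) -> int:
    """Mirror the shell logic: fail on real errors, tolerate exit code 5
    (pytest: no tests collected), but fail if *every* shard collected nothing."""
    collected_somewhere = False
    for status in statuses:
        if status not in (0, 5):
            return status          # a shard genuinely failed
        if status == 0:
            collected_somewhere = True
    if statuses and not collected_somewhere:
        return 1                   # all shards reported "no tests collected"
    return 0

assert aggregate_shard_statuses([0, 5, 0]) == 0
assert aggregate_shard_statuses([5, 5]) == 1
assert aggregate_shard_statuses([0, 2]) == 2
```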
@@ -5,7 +5,9 @@
set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
cat <<EOF | docker build -t ${image_name} -f - .
FROM gaudi-base-image:latest

COPY ./ /workspace/vllm

@@ -15,7 +17,8 @@ WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=empty pip install .
RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

# install development dependencies (for testing)

@@ -36,15 +39,20 @@ EOF
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
remove_docker_containers() { docker rm -f ${container_name} || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers

echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
docker run --rm --runtime=habana --name=${container_name} --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  hpu-plugin-v1-test-env \
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
  -e VLLM_SKIP_WARMUP=true \
  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
    cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
  '

EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then
@@ -38,10 +38,11 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2
python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
cd tests
pytest -v -s v1/core
pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
@@ -43,7 +43,6 @@ trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \

@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --all2all-backend $BACK \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
@@ -542,7 +542,7 @@ steps:
- label: LoRA Test %N # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/lora

@@ -604,9 +604,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # # Limit to no custom ops to reduce running time
  # # Wrap with quotes to escape yaml and avoid starting -k string with a -
  # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- label: Cudagraph test
  timeout_in_minutes: 20

@@ -636,7 +638,7 @@ steps:
- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/

@@ -651,7 +653,7 @@ steps:
- label: Kernels Quantization Test %N # 64min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/

@@ -664,7 +666,7 @@ steps:
- label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/

@@ -742,7 +744,7 @@ steps:
- label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:

@@ -753,7 +755,7 @@ steps:
- label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/

@@ -827,7 +829,7 @@ steps:
- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -888,7 +890,7 @@ steps:
- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -909,7 +911,7 @@ steps:
- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:

@@ -1181,7 +1183,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi

@@ -1189,33 +1190,16 @@ steps:
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"

  # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # # Wrap with quotes to escape yaml
  # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"

@@ -1566,7 +1550,10 @@ steps:
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
@@ -537,9 +537,11 @@ steps:
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # # Limit to no custom ops to reduce running time
  # # Wrap with quotes to escape yaml and avoid starting -k string with a -
  # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- label: Cudagraph test
  timeout_in_minutes: 20

@@ -1069,7 +1071,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  - nvidia-smi

@@ -1077,75 +1078,15 @@ steps:
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # # Wrap with quotes to escape yaml
  # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py

- label: Hopper Fusion E2E Tests (H100) # 10min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  gpu: h100
  optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # skip Llama-4 since it does not fit on this device
  - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'

- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  gpu: h100
  optional: true
  num_gpus: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -2,56 +2,196 @@ group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
- label: Sequence Parallel Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/layers/
  - vllm/compilation/
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - tests/distributed/test_sequence_parallel.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py

- label: Sequence Parallel Tests (2xH100)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py

- label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_sequence_parallelism.py
  - tests/compile/distributed/test_async_tp.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_async_tp.py

- label: Fusion and Compile Unit Tests (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_devices=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
  # Wrap with quotes to escape yaml
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  # TODO(luka) move to H100 once pass tests run on H100
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
- label: Fusion E2E Quick (H100)
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/"
  device: b200
  optional: true
  num_devices: 2
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
  # Run all models and attn backends but only Inductor partition and native custom ops
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

- label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp8) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

- label: Fusion E2E Config Sweep (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 1
  optional: true
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  # -k "inductor_partition and not +rms_norm and not +quant_fp8"
  # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
  # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
  # Run just llama3 (fp8 & fp4) for all config combinations
  # -k "llama-3"
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"

- label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run just llama3 (fp8 & bf16) for all config combinations
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

- label: Fusion E2E TP2 (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  # for ar-rms-quant-fp4, also sweep llama3
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

- label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:

@@ -47,7 +47,6 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -133,25 +132,13 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Sequence Parallel Tests (H100)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run sequence parallel tests
  - pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py

- label: Distributed Tests (2 GPUs)(H100)
  timeout_in_minutes: 15
  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_devices: 2
  commands:
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
  - pytest -v -s tests/v1/distributed/test_dbo.py

@@ -217,45 +204,3 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: Hopper Fusion E2E Tests (H100)
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/test_fusion_attn.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # skip Llama-4 since it does not fit on this device
  - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'

- label: Hopper Fusion Distributed E2E Tests (2xH100)
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not slow_test' v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py

@@ -166,4 +166,18 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py

- label: Acceptance Length Test (Large Models) # optional
  timeout_in_minutes: 25
  gpu: h100
  optional: true
  num_gpus: 1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/mlp_speculator.py
  - tests/v1/spec_decode/test_acceptance_length.py
  commands:
  - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
  - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
@@ -18,7 +18,7 @@ steps:
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

- label: PyTorch Fullgraph Smoke Test
  timeout_in_minutes: 30
  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile

@@ -30,16 +30,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

- label: PyTorch Fullgraph
  timeout_in_minutes: 40
  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
  # Limit to no custom ops to reduce running time
  # Wrap with quotes to escape yaml and avoid starting -k string with a -
  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

- label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
@@ -842,6 +842,7 @@ class BenchmarkTensors:
    "sorted_token_ids": sorted_token_ids,
    "expert_ids": expert_ids,
    "num_tokens_post_padded": num_tokens_post_padded,
    "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
    "top_k_num": ctx.top_k_num,
    "device": self.input.device,
    "N": lora_rank,

@@ -915,6 +916,7 @@ class BenchmarkTensors:
    "sorted_token_ids": sorted_token_ids,
    "expert_ids": expert_ids,
    "num_tokens_post_padded": num_tokens_post_padded,
    "token_lora_mapping": self.lora_kernel_meta.token_lora_mapping,
    "top_k_num": ctx.top_k_num,
    "device": self.input.device,
    "N": lora_rank,
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts,
)
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils.argparse_utils import FlexibleArgumentParser

@@ -482,6 +481,8 @@ class BenchmarkWorker:
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        # local import to allow serialization by ray

        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8

@@ -535,6 +536,9 @@ class BenchmarkWorker:
        block_quant_shape: list[int],
        use_deep_gemm: bool,
    ) -> dict[str, int]:
        # local import to allow serialization by ray
        from vllm.platforms import current_platform

        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():

@@ -646,20 +650,28 @@ def save_configs(
        f.write("\n")


def get_compressed_tensors_block_structure(config, default_value=None):
    config_groups = config.get("config_groups", {})
    if len(config_groups) != 1:
        return default_value
    group = next(iter(config_groups.values()))
    weights = group.get("weights", {})
    block_structure = weights.get("block_structure", default_value)
    return block_structure


def get_weight_block_size_safety(config, default_value=None):
    quantization_config = getattr(config, "quantization_config", {})
    if isinstance(quantization_config, dict):
        return quantization_config.get("weight_block_size", default_value)
        if "weight_block_size" in quantization_config:
            return quantization_config["weight_block_size"]
        return get_compressed_tensors_block_structure(
            quantization_config, default_value
        )
    return default_value


def main(args: argparse.Namespace):
    print(args)

    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)

    def get_model_params(config):
        if config.architectures[0] == "DbrxForCausalLM":
            E = config.ffn_config.moe_num_experts
            topk = config.ffn_config.moe_top_k
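For context, a small usage sketch of the two helpers added in the hunk above, assuming the new version of `get_weight_block_size_safety` (the one that falls back to `get_compressed_tensors_block_structure`) is in scope. The module path in the import and the config values are illustrative stand-ins, not real model configs.

```python
from types import SimpleNamespace

# Assumed import path for this benchmark script; adjust to where the helpers live.
from benchmark_moe import get_weight_block_size_safety

# Checkpoints that expose weight_block_size directly.
fp8_cfg = SimpleNamespace(quantization_config={"weight_block_size": [128, 128]})
print(get_weight_block_size_safety(fp8_cfg))  # [128, 128]

# compressed-tensors style: block info nested under config_groups/.../weights.
ct_cfg = SimpleNamespace(
    quantization_config={
        "config_groups": {"group_0": {"weights": {"block_structure": [128, 128]}}}
    }
)
print(get_weight_block_size_safety(ct_cfg))  # falls through to block_structure

# No quantization config at all -> the default value.
print(get_weight_block_size_safety(SimpleNamespace(), default_value=None))  # None
```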
@@ -677,6 +689,7 @@ def main(args: argparse.Namespace):
            "Glm4MoeForCausalLM",
            "Glm4MoeLiteForCausalLM",
            "NemotronHForCausalLM",
            "MistralLarge3ForCausalLM",
        ):
            E = config.n_routed_experts
            topk = config.num_experts_per_tok

@@ -697,16 +710,20 @@ def main(args: argparse.Namespace):
            topk = text_config.num_experts_per_tok
            intermediate_size = text_config.moe_intermediate_size
            hidden_size = text_config.hidden_size
        elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
        elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
            E = config.num_experts
            topk = config.moe_topk[0]
            intermediate_size = config.moe_intermediate_size[0]
            hidden_size = config.hidden_size
        elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
        elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
            E = config.thinker_config.text_config.num_experts
            topk = config.thinker_config.text_config.num_experts_per_tok
            intermediate_size = config.thinker_config.text_config.moe_intermediate_size
            hidden_size = config.thinker_config.text_config.hidden_size
        elif config.architectures[0] == "PixtralForConditionalGeneration":
            # Pixtral can contain different LLM architectures,
            # recurse to get their parameters
            return get_model_params(config.get_text_config())
        else:
            # Support for llama4
            config = config.get_text_config()

@@ -715,6 +732,16 @@ def main(args: argparse.Namespace):
            topk = config.num_experts_per_tok
            intermediate_size = config.intermediate_size
            hidden_size = config.hidden_size
        return E, topk, intermediate_size, hidden_size


def main(args: argparse.Namespace):
    print(args)

    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
    if args.model_prefix:
        config = getattr(config, args.model_prefix)
    E, topk, intermediate_size, hidden_size = get_model_params(config)
    enable_ep = bool(args.enable_expert_parallel)
    if enable_ep:
        ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -14,7 +14,7 @@ from vllm._custom_ops import (
)
from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa


@@ -58,7 +58,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
) -> None:
    current_platform.seed_everything(seed)
    set_random_seed(seed)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
@@ -7,8 +7,8 @@ import time
import numpy as np
import torch

from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import set_random_seed

# Check if CPU MoE operations are available
try:

@@ -41,7 +41,7 @@ def main(
    seed: int = 0,
    iters: int = 20,
) -> None:
    current_platform.seed_everything(seed)
    set_random_seed(seed)
    # up_dim = 2 * intermediate_size for gate + up projection
    up_dim = 2 * intermediate_size
@@ -359,6 +359,19 @@ else()
  add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()

#
# Generate CPU attention dispatch header
#
message(STATUS "Generating CPU attention dispatch header")
execute_process(
  COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
  RESULT_VARIABLE GEN_RESULT
)
if(NOT GEN_RESULT EQUAL 0)
  message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
endif()

#
# _C extension
#
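The build step above runs csrc/cpu/generate_cpu_attn_dispatch.py to produce the cpu_attn_dispatch_generated.h header that replaces the hand-written dispatch macros later in this diff. The generator itself is not shown here; purely as an illustration of the idea (emitting one entry per supported head size), a hypothetical generator could look like this:

```python
# Hypothetical sketch of a dispatch-header generator; the real
# csrc/cpu/generate_cpu_attn_dispatch.py is not shown in this diff.
# The head-dim list matches the cases in the removed hand-written macros.
HEAD_DIMS = [32, 64, 80, 96, 112, 128, 160, 192, 224, 256]

def emit_header() -> str:
    # X-macro style: the consumer defines CPU_ATTN_CASE(dim) before using this.
    cases = " \\\n".join(f"  CPU_ATTN_CASE({dim})" for dim in HEAD_DIMS)
    return (
        "// Auto-generated by generate_cpu_attn_dispatch.py. Do not edit.\n"
        "#pragma once\n"
        f"#define CPU_ATTN_FOR_EACH_HEAD_DIM \\\n{cases}\n"
    )

if __name__ == "__main__":
    with open("cpu_attn_dispatch_generated.h", "w") as f:
        f.write(emit_header())
```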
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
    vllm-flash-attn
    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
    GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
    GIT_TAG 2adfc8c2177c5b0e8ddeedfd5a8990d80eb496ff
    GIT_PROGRESS TRUE
    # Don't share the vllm-flash-attn build between build types
    BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -1,79 +1,4 @@
#include "cpu_attn_vec.hpp"
#include "cpu_attn_vec16.hpp"

#ifdef CPU_CAPABILITY_AMXBF16
#include "cpu_attn_amx.hpp"
#define AMX_DISPATCH(...)                                                    \
  case cpu_attention::ISA::AMX: {                                            \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::AMX,  \
                                                   scalar_t, head_dim>;      \
    return __VA_ARGS__();                                                    \
  }
#else
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif

#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
// NEON requires head_dim to be a multiple of 32
#define NEON_DISPATCH(...)                                                   \
  case cpu_attention::ISA::NEON: {                                           \
    using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
                                                   scalar_t, head_dim>;      \
    return __VA_ARGS__();                                                    \
  }
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif  // #ifdef __aarch64__

#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...)                                \
  case HEAD_DIM: {                                                           \
    constexpr size_t head_dim = HEAD_DIM;                                    \
    return __VA_ARGS__();                                                    \
  }

#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...)                        \
  [&] {                                                                      \
    switch (HEAD_DIM) {                                                      \
      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                                \
      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__)                               \
      CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__)                               \
      default: {                                                             \
        TORCH_CHECK(false, "Invalid CPU attention head_dim: " +              \
                               std::to_string(HEAD_DIM));                    \
      }                                                                      \
    }                                                                        \
  }()

#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...)                                \
  [&] {                                                                      \
    switch (ISA_TYPE) {                                                      \
      AMX_DISPATCH(__VA_ARGS__)                                              \
      NEON_DISPATCH(__VA_ARGS__)                                             \
      case cpu_attention::ISA::VEC: {                                        \
        using attn_impl =                                                    \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t,  \
                                         head_dim>;                          \
        return __VA_ARGS__();                                                \
      }                                                                      \
      case cpu_attention::ISA::VEC16: {                                      \
        using attn_impl =                                                    \
            cpu_attention::AttentionImpl<cpu_attention::ISA::VEC16, scalar_t,\
                                         head_dim>;                          \
        return __VA_ARGS__();                                                \
      }                                                                      \
      default: {                                                             \
        TORCH_CHECK(false, "Invalid CPU attention ISA type.");               \
      }                                                                      \
    }                                                                        \
  }()
#include "cpu_attn_dispatch_generated.h"

torch::Tensor get_scheduler_metadata(
    const int64_t num_req, const int64_t num_heads_q,

@@ -122,16 +47,14 @@ torch::Tensor get_scheduler_metadata(
  input.enable_kv_split = enable_kv_split;

  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
      CPU_ATTN_DISPATCH_IMPL(isa, [&]() {
        input.elem_size = sizeof(scalar_t);
        input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
        input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
        input.output_buffer_elem_size =
            sizeof(attn_impl::partial_output_buffer_t);
        input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
        input.kv_block_alignment = attn_impl::BlockSizeAlignment;
      });
    CPU_ATTN_DISPATCH(head_dim, isa, [&]() {
      input.elem_size = sizeof(scalar_t);
      input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t);
      input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t);
      input.output_buffer_elem_size =
          sizeof(attn_impl::partial_output_buffer_t);
      input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration;
      input.kv_block_alignment = attn_impl::BlockSizeAlignment;
    });
  });

@@ -184,18 +107,14 @@ void cpu_attn_reshape_and_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
          CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() {
            attn_impl::reshape_and_cache(
                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
                key_cache.data_ptr<scalar_t>(),
                value_cache.data_ptr<scalar_t>(),
                slot_mapping.data_ptr<int64_t>(), token_num,
                key_token_num_stride, value_token_num_stride, head_num,
                key_head_num_stride, value_head_num_stride, num_blocks,
                num_blocks_stride, cache_head_num_stride, block_size,
                block_size_stride);
          });
        CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() {
          attn_impl::reshape_and_cache(
              key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
              key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
              slot_mapping.data_ptr<int64_t>(), token_num, key_token_num_stride,
              value_token_num_stride, head_num, key_head_num_stride,
              value_head_num_stride, num_blocks, num_blocks_stride,
              cache_head_num_stride, block_size, block_size_stride);
        });
      });
}

@@ -257,12 +176,10 @@ void cpu_attention_with_kv_cache(

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "cpu_attention_with_kv_cache", [&]() {
        CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] {
          CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() {
            TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
            cpu_attention::AttentionMainLoop<attn_impl> mainloop;
            mainloop(&input);
          });
        CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() {
          TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0);
          cpu_attention::AttentionMainLoop<attn_impl> mainloop;
          mainloop(&input);
        });
      });
}

@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, const float scale) {
    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
    constexpr int64_t head_elem_num_pre_block =
        AMX_TILE_ROW_BYTES / sizeof(scalar_t);

@@ -816,14 +816,10 @@ struct VecTypeTrait<float> {
  using vec_t = vec_op::FP32Vec16;
};

// ARM only supports BF16 with ARMv8.6-A extension
#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
#else
template <>
struct VecTypeTrait<c10::BFloat16> {
  using vec_t = vec_op::BF16Vec16;
};
#endif

#if !defined(__powerpc__) && !defined(__s390x__)
|
||||
template <>
|
||||
@@ -1585,17 +1581,10 @@ class AttentionMainLoop {
|
||||
|
||||
if (use_sink) {
|
||||
alignas(64) float s_aux_fp32[16];
|
||||
#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// ARM without native BF16 support: manual conversion
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
s_aux_fp32[i] = static_cast<float>(curr_s_aux[i]);
|
||||
}
|
||||
#else
|
||||
// All other platforms have BF16Vec16 available
|
||||
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
|
||||
vec_op::FP32Vec16 vec_fp32(vec_bf16);
|
||||
vec_fp32.save(s_aux_fp32);
|
||||
#endif
|
||||
|
||||
float* __restrict__ curr_sum_buffer = sum_buffer;
|
||||
float* __restrict__ curr_max_buffer = max_buffer;
|
||||
|
||||
@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
||||
constexpr static ISA ISAType = ISA::NEON;
|
||||
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
||||
|
||||
// static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
// the gemm micro kernel is Mx8
|
||||
static_assert(HeadDimAlignment % 8 == 0);
|
||||
static_assert(BlockSizeAlignment % 8 == 0);
|
||||
|
||||
File diff suppressed because it is too large
@@ -14,13 +14,11 @@ struct KernelVecType<float> {
|
||||
using cvt_vec_type = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using load_vec_type = vec_op::BF16Vec16;
|
||||
using cvt_vec_type = vec_op::FP32Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <>
|
||||
struct KernelVecType<c10::Half> {
|
||||
|
||||
203
csrc/cpu/generate_cpu_attn_dispatch.py
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Generate CPU attention dispatch switch cases and kernel instantiations.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Head dimensions divisible by 32 (support all ISAs)
|
||||
HEAD_DIMS_32 = [32, 64, 96, 128, 160, 192, 224, 256]
|
||||
|
||||
# Head dimensions divisible by 16 but not 32 (VEC16 only)
|
||||
HEAD_DIMS_16 = [80, 112]
|
||||
|
||||
# ISA types
|
||||
ISA_TYPES = {
|
||||
"AMX": 0,
|
||||
"VEC": 1,
|
||||
"VEC16": 2,
|
||||
"NEON": 3,
|
||||
}
|
||||
|
||||
# ISAs supported for head_dims divisible by 32
|
||||
ISA_FOR_32 = ["AMX", "NEON", "VEC", "VEC16"]
|
||||
|
||||
# ISAs supported for head_dims divisible by 16 only
|
||||
ISA_FOR_16 = ["VEC16"]
|
||||
|
||||
|
||||
def encode_params(head_dim: int, isa_type: str) -> int:
|
||||
"""Encode head_dim and ISA type into a single int64_t."""
|
||||
isa_val = ISA_TYPES[isa_type]
|
||||
# Encoding: (head_dim << 8) | isa_type
|
||||
# This allows head_dim up to 2^56 - 1 and 256 ISA types
|
||||
return (head_dim << 8) | isa_val
|
||||
|
||||
|
||||
def generate_cases_for_isa_group(isa_list: list[str]) -> str:
|
||||
"""Generate switch cases for a specific ISA group."""
|
||||
cases = []
|
||||
|
||||
# Generate cases for head_dims divisible by 32
|
||||
for head_dim in HEAD_DIMS_32:
|
||||
for isa in isa_list:
|
||||
if isa not in ISA_FOR_32:
|
||||
continue
|
||||
encoded = encode_params(head_dim, isa)
|
||||
case_str = (
|
||||
f""" case {encoded}LL: {{ """
|
||||
f"""/* head_dim={head_dim}, isa={isa} */ \\"""
|
||||
f"""
|
||||
constexpr size_t head_dim = {head_dim}; \\"""
|
||||
f"""
|
||||
using attn_impl = cpu_attention::AttentionImpl<"""
|
||||
f"""cpu_attention::ISA::{isa}, \\"""
|
||||
f"""
|
||||
"""
|
||||
f"""scalar_t, head_dim>; \\"""
|
||||
f"""
|
||||
return __VA_ARGS__(); \\"""
|
||||
f"""
|
||||
}} \\"""
|
||||
)
|
||||
cases.append(case_str)
|
||||
|
||||
# Generate cases for head_dims divisible by 16 only
|
||||
for head_dim in HEAD_DIMS_16:
|
||||
for isa in isa_list:
|
||||
encoded = encode_params(head_dim, isa)
|
||||
case_str = (
|
||||
f""" case {encoded}LL: {{ """
|
||||
f"""/* head_dim={head_dim}, isa={isa} """
|
||||
f"""(using VEC16) */ \\"""
|
||||
f"""
|
||||
constexpr size_t head_dim = {head_dim}; \\"""
|
||||
f"""
|
||||
using attn_impl = cpu_attention::AttentionImpl<"""
|
||||
f"""cpu_attention::ISA::VEC16, \\"""
|
||||
f"""
|
||||
"""
|
||||
f"""scalar_t, head_dim>; \\"""
|
||||
f"""
|
||||
return __VA_ARGS__(); \\"""
|
||||
f"""
|
||||
}} \\"""
|
||||
)
|
||||
cases.append(case_str)
|
||||
|
||||
return "\n".join(cases)
|
||||
|
||||
|
||||
def generate_helper_function() -> str:
|
||||
"""Generate helper function to encode parameters."""
|
||||
return """
|
||||
inline int64_t encode_cpu_attn_params(int64_t head_dim, cpu_attention::ISA isa) {
|
||||
return (head_dim << 8) | static_cast<int64_t>(isa);
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def generate_header_file() -> str:
|
||||
"""Generate the complete header file content."""
|
||||
header = """// auto generated by generate_cpu_attn_dispatch.py
|
||||
// clang-format off
|
||||
|
||||
#ifndef CPU_ATTN_DISPATCH_GENERATED_H
|
||||
#define CPU_ATTN_DISPATCH_GENERATED_H
|
||||
|
||||
#include "cpu_attn_vec.hpp"
|
||||
#include "cpu_attn_vec16.hpp"
|
||||
|
||||
#ifdef CPU_CAPABILITY_AMXBF16
|
||||
#include "cpu_attn_amx.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include "cpu_attn_neon.hpp"
|
||||
#endif
|
||||
|
||||
"""
|
||||
|
||||
header += generate_helper_function()
|
||||
|
||||
# Generate dispatch macro with conditional compilation for different ISA sets
|
||||
header += """
|
||||
// Dispatch macro using encoded parameters
|
||||
"""
|
||||
|
||||
# x86_64 with AMX
|
||||
header += """#if defined(CPU_CAPABILITY_AMXBF16)
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["AMX", "VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
"""
|
||||
|
||||
# ARM64 with NEON
|
||||
header += """#elif defined(__aarch64__)
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["NEON", "VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
"""
|
||||
|
||||
# Fallback: VEC and VEC16 only
|
||||
header += """#else
|
||||
#define CPU_ATTN_DISPATCH(HEAD_DIM, ISA_TYPE, ...) \\
|
||||
[&] { \\
|
||||
int64_t encoded_params = encode_cpu_attn_params(HEAD_DIM, ISA_TYPE); \\
|
||||
switch (encoded_params) { \\
|
||||
"""
|
||||
header += generate_cases_for_isa_group(["VEC", "VEC16"])
|
||||
header += """
|
||||
default: { \\
|
||||
TORCH_CHECK(false, "Unsupported CPU attention configuration: head_dim=" + \\
|
||||
std::to_string(HEAD_DIM) + " isa=" + \\
|
||||
std::to_string(static_cast<int>(ISA_TYPE))); \\
|
||||
} \\
|
||||
} \\
|
||||
}()
|
||||
|
||||
#endif /* CPU_CAPABILITY_AMXBF16 / __aarch64__ */
|
||||
|
||||
#endif // CPU_ATTN_DISPATCH_GENERATED_H
|
||||
"""
|
||||
|
||||
return header
|
||||
|
||||
|
||||
def main():
|
||||
output_path = os.path.join(
|
||||
os.path.dirname(__file__), "cpu_attn_dispatch_generated.h"
|
||||
)
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
f.write(generate_header_file())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
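As a quick illustration of the `(head_dim << 8) | isa` packing shared by `encode_params` above and the generated `encode_cpu_attn_params` helper, here is a standalone round-trip sketch; the helper names below are illustrative only and not part of the patch:

```python
# Illustrative round-trip of the (head_dim << 8) | isa packing; not part of the patch.
ISA_IDS = {"AMX": 0, "VEC": 1, "VEC16": 2, "NEON": 3}  # mirrors ISA_TYPES above

def pack(head_dim: int, isa: str) -> int:
    return (head_dim << 8) | ISA_IDS[isa]

def unpack(code: int) -> tuple[int, int]:
    return code >> 8, code & 0xFF

assert pack(128, "AMX") == 32768             # 128 << 8 | 0, i.e. the generated "case 32768LL"
assert unpack(pack(80, "VEC16")) == (80, 2)  # 80 is a VEC16-only head_dim
```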
|
||||
@@ -38,9 +38,7 @@ struct KernelVecType<c10::BFloat16> {
|
||||
using qk_vec_type = vec_op::BF16Vec32;
|
||||
using v_load_vec_type = vec_op::BF16Vec16;
|
||||
};
|
||||
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
|
||||
// pass
|
||||
#else
|
||||
#elif defined(__aarch64__)
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using qk_load_vec_type = vec_op::BF16Vec16;
|
||||
|
||||
@@ -265,7 +265,7 @@ void tinygemm_kernel(
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,7 +324,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_NN(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ void tinygemm_kernel(
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
case 0x44: LAUNCH_TINYGEMM_KERNEL_NN(4, 64); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -398,7 +398,7 @@ void tinygemm_kernel(
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN(3, 32); break;
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -511,7 +511,7 @@ void tinygemm_kernel(
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_NN2(3, 32); break;
|
||||
// mb_size = 4
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_NN2(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,7 +271,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -401,7 +401,7 @@ void tinygemm_kernel(
|
||||
case 0x22: LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32); break;
|
||||
case 0x32: LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32); break;
|
||||
case 0x42: LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32); break;
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
|
||||
default: TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", nb_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,12 +30,10 @@ struct VecTypeTrait<float> {
|
||||
using vec_t = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
|
||||
template <>
|
||||
struct VecTypeTrait<c10::BFloat16> {
|
||||
using vec_t = vec_op::BF16Vec16;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__powerpc__)
|
||||
template <>
|
||||
|
||||
@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
||||
if (flag) { // support GPUDirect RDMA if possible
|
||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
}
|
||||
int fab_flag = 0;
|
||||
CUDA_CHECK(cuDeviceGetAttribute(
|
||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
|
||||
if (fab_flag) { // support fabric handle if possible
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Allocate memory using cuMemCreate
|
||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||
CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
|
||||
if (ret) {
|
||||
if (fab_flag &&
|
||||
(ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
|
||||
// Fabric allocation may fail without multi-node nvlink,
|
||||
// fallback to POSIX file descriptor
|
||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
|
||||
} else {
|
||||
CUDA_CHECK(ret);
|
||||
}
|
||||
}
|
||||
if (error_code != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
#include "cutlass/cutlass.h"
|
||||
#include <climits>
|
||||
#include "cuda_runtime.h"
|
||||
#include <iostream>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
/**
|
||||
* Helper function for checking CUTLASS errors
|
||||
@@ -31,12 +32,63 @@ int32_t get_sm_version_num();
|
||||
* __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
|
||||
* into code that will be executed on the device where it is defined.
|
||||
*/
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm75_to_sm80 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[75, 80).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm80_to_sm89 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[80, 89).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm89_to_sm90 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm[89, 90).\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm90_or_later : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm >= 90.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -45,18 +97,43 @@ template <typename Kernel>
|
||||
struct enable_sm90_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 900
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm90.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm100_only : Kernel {
|
||||
struct enable_sm100f_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1000
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm100f.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm100a_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1000
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm100a.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
@@ -65,8 +142,13 @@ template <typename Kernel>
|
||||
struct enable_sm120_only : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE void operator()(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
|
||||
#if defined __CUDA_ARCH__
|
||||
#if __CUDA_ARCH__ == 1200
|
||||
Kernel::operator()(std::forward<Args>(args)...);
|
||||
#else
|
||||
printf("This kernel only supports sm120.\n");
|
||||
asm("trap;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -770,7 +770,7 @@ torch::Tensor moe_wna16_marlin_gemm(
|
||||
b_bias = b_bias_or_none.value();
|
||||
TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
|
||||
TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
|
||||
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
|
||||
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(1) != size_n");
|
||||
TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
|
||||
} else {
|
||||
b_bias = torch::empty({0}, options);
|
||||
|
||||
@@ -141,8 +141,8 @@ struct cutlass_3x_gemm_sm100 {
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
template <typename ElementAB_, typename ElementD_,
|
||||
@@ -202,8 +202,8 @@ struct cutlass_3x_gemm_sm120 {
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
KernelSchedule>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
@@ -123,7 +123,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
|
||||
MainloopScheduler
|
||||
>::CollectiveOp>;
|
||||
|
||||
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
|
||||
|
||||
struct GemmKernel : public KernelType {};
|
||||
|
||||
@@ -90,8 +90,8 @@ struct cutlass_3x_gemm_sm100_fp8 {
|
||||
// -----------------------------------------------------------
|
||||
// Kernel definition
|
||||
// -----------------------------------------------------------
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
|
||||
using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
|
||||
@@ -36,41 +36,6 @@ using namespace cute;
|
||||
*/
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Wrappers for the GEMM kernel that is used to guard against compilation on
|
||||
// architectures that will never use the kernel. The purpose of this is to
|
||||
// reduce the size of the compiled binary.
|
||||
// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
|
||||
// into code that will be executed on the device where it is defined.
|
||||
template <typename Kernel>
|
||||
struct enable_sm75_to_sm80 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm80_to_sm89 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Kernel>
|
||||
struct enable_sm89_to_sm90 : Kernel {
|
||||
template <typename... Args>
|
||||
CUTLASS_DEVICE static void invoke(Args&&... args) {
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
|
||||
Kernel::invoke(std::forward<Args>(args)...);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
template <typename Arch, template <typename> typename ArchGuard,
|
||||
typename ElementAB_, typename ElementD_,
|
||||
template <typename, typename> typename Epilogue_, typename TileShape,
|
||||
|
||||
@@ -50,7 +50,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -58,7 +58,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -67,7 +67,7 @@ struct sm89_fp8_config_default {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -100,7 +100,7 @@ struct sm89_fp8_config_M256 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -108,7 +108,7 @@ struct sm89_fp8_config_M256 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -141,7 +141,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -150,7 +150,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -158,7 +158,7 @@ struct sm89_fp8_config_M128 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<128, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -191,7 +191,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -201,7 +201,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAddFastAccum;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -211,7 +211,7 @@ struct sm89_fp8_config_M64 {
|
||||
using FP8MathOperator = typename cutlass::arch::OpMultiplyAdd;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -244,7 +244,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -253,7 +253,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -262,7 +262,7 @@ struct sm89_fp8_config_M32 {
|
||||
using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5, FP8MathOperator>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -296,7 +296,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
@@ -305,7 +305,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<16, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
@@ -314,7 +314,7 @@ struct sm89_fp8_config_M16 {
|
||||
using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, MainLoopStages,
|
||||
FP8MathOperator>,
|
||||
|
||||
@@ -48,7 +48,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -56,7 +56,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -64,7 +64,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -72,7 +72,7 @@ struct sm89_int8_config_default {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -104,7 +104,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<64, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -112,7 +112,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -120,7 +120,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<256, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -128,7 +128,7 @@ struct sm89_int8_config_M256 {
|
||||
using TileShape = cutlass::gemm::GemmShape<128, 128, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -160,7 +160,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -169,7 +169,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -178,7 +178,7 @@ struct sm89_int8_config_M128 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -210,7 +210,7 @@ struct sm89_int8_config_M64 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -219,7 +219,7 @@ struct sm89_int8_config_M64 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 3>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -251,7 +251,7 @@ struct sm89_int8_config_M32 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<16, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -260,7 +260,7 @@ struct sm89_int8_config_M32 {
|
||||
using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -292,7 +292,7 @@ struct sm89_int8_config_M16 {
|
||||
using TileShape = cutlass::gemm::GemmShape<16, 64, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 5>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
@@ -300,7 +300,7 @@ struct sm89_int8_config_M16 {
|
||||
using TileShape = cutlass::gemm::GemmShape<16, 128, 128>;
|
||||
|
||||
return vllm::fallback_cutlass_gemm_caller<
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, vllm::enable_sm89_to_sm90,
|
||||
vllm::cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
|
||||
InType, OutType, Epilogue, TileShape, WarpShape,
|
||||
InstructionShape, 4>,
|
||||
FallbackGemm>(out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||
|
||||
@@ -97,9 +97,7 @@ ARG PYTHON_VERSION
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install system dependencies including build tools
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ccache \
|
||||
software-properties-common \
|
||||
@@ -502,9 +500,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and system dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
software-properties-common \
|
||||
curl \
|
||||
@@ -586,7 +582,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# This is ~1.1GB and only changes when FlashInfer version bumps
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
# From versions.json: .flashinfer.version
|
||||
ARG FLASHINFER_VERSION=0.6.1
|
||||
ARG FLASHINFER_VERSION=0.6.2
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
|
||||
&& uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
|
||||
@@ -713,9 +709,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y git
|
||||
|
||||
# We can specify the standard or nightly build of PyTorch
|
||||
|
||||
@@ -20,9 +20,7 @@ ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
@@ -172,9 +170,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& for i in 1 2 3; do \
|
||||
@@ -221,13 +217,13 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source around 10 mins
|
||||
# release version: v0.6.1
|
||||
# release version: v0.6.2
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --depth 1 --branch v0.6.1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& git clone --depth 1 --branch v0.6.2 --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
|
||||
@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ARG ARG_PYTORCH_ROCM_ARCH
|
||||
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Base UBI image for s390x architecture
|
||||
ARG BASE_UBI_IMAGE_TAG=9.5-1736404155
|
||||
ARG BASE_UBI_IMAGE_TAG=9.6
|
||||
ARG PYTHON_VERSION=3.12
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
|
||||
|
||||
@@ -14,12 +14,18 @@ ENV LANG=C.UTF-8 \
|
||||
|
||||
# Install development utilities
|
||||
RUN microdnf install -y \
|
||||
which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
|
||||
which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-binutils gcc-toolset-14-libatomic-devel patch zlib-devel \
|
||||
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
|
||||
clang llvm-devel llvm-static clang-devel && \
|
||||
microdnf clean all
|
||||
|
||||
ENV GCC_TOOLSET_ROOT=/opt/rh/gcc-toolset-14/root \
|
||||
PATH=/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:/usr/bin:/bin \
|
||||
LD_LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64:/usr/local/lib:/usr/lib64 \
|
||||
LIBRARY_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64 \
|
||||
PKG_CONFIG_PATH=/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig
|
||||
|
||||
# Python Installation
|
||||
FROM base AS python-install
|
||||
ARG PYTHON_VERSION
|
||||
@@ -87,13 +93,13 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
|
||||
|
||||
FROM python-install AS torch-vision
|
||||
# Install torchvision
|
||||
ARG TORCH_VISION_VERSION=v0.23.0
|
||||
ARG TORCH_VISION_VERSION=v0.25.0
|
||||
WORKDIR /tmp
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
git clone https://github.com/pytorch/vision.git && \
|
||||
cd vision && \
|
||||
git checkout $TORCH_VISION_VERSION && \
|
||||
uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
|
||||
uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cpu && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
FROM python-install AS hf-xet-builder
|
||||
@@ -174,7 +180,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
|
||||
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
|
||||
fi && python setup.py bdist_wheel
|
||||
|
||||
|
||||
# Build OpenCV from source for s390x
|
||||
FROM python-install AS opencv-builder
|
||||
WORKDIR /tmp
|
||||
ARG MAX_JOBS
|
||||
ARG OPENCV_VERSION=90
|
||||
ARG ENABLE_HEADLESS=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install numpy setuptools wheel scikit_build build && \
|
||||
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
|
||||
cd opencv-python && \
|
||||
python -m build --wheel --installer=uv --outdir /tmp/opencv-python/dist
|
||||
|
||||
# Build Outlines Core
|
||||
FROM python-install AS outlines-core-builder
|
||||
WORKDIR /tmp
|
||||
@@ -198,7 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Final build stage
|
||||
FROM python-install AS vllm-cpu
|
||||
ARG PYTHON_VERSION
|
||||
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
# Set correct library path for torch and numactl
|
||||
ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
|
||||
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
|
||||
@@ -209,7 +227,8 @@ ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
|
||||
ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
|
||||
ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
COPY . /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
@@ -225,23 +244,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
|
||||
--mount=type=bind,from=opencv-builder,source=/tmp/opencv-python/dist,target=/tmp/opencv-wheels/ \
|
||||
--mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
|
||||
sed -i '/^torch/d' requirements/build.txt && \
|
||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
|
||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
|
||||
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
|
||||
LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
|
||||
NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
|
||||
OPENCV_WHL_FILE=$(ls /tmp/opencv-wheels/*.whl) && \
|
||||
OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
|
||||
uv pip install -v \
|
||||
uv pip install -v \
|
||||
$ARROW_WHL_FILE \
|
||||
$VISION_WHL_FILE \
|
||||
$HF_XET_WHL_FILE \
|
||||
$LLVM_WHL_FILE \
|
||||
$NUMBA_WHL_FILE \
|
||||
$OPENCV_WHL_FILE \
|
||||
$OUTLINES_CORE_WHL_FILE \
|
||||
--index-strategy unsafe-best-match \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu-build.txt \
|
||||
-r requirements/cpu.txt
|
||||
|
||||
|
||||
@@ -252,7 +273,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
# setup non-root user for vllm
|
||||
RUN umask 002 && \
|
||||
useradd --uid 2000 --gid 0 vllm && \
|
||||
/usr/sbin/useradd --uid 2000 --gid 0 vllm && \
|
||||
mkdir -p /home/vllm && \
|
||||
chmod g+rwx /home/vllm
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
|
||||
add-apt-repository -y ppa:kobuk-team/intel-graphics
|
||||
|
||||
RUN apt clean && apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
@@ -25,10 +25,13 @@ RUN apt clean && apt-get update -y && \
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
||||
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
|
||||
RUN apt update && apt upgrade -y && \
|
||||
apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
|
||||
apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
|
||||
|
||||
|
||||
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
|
||||
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
|
||||
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.8_offline.sh"
|
||||
RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
|
||||
bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
|
||||
rm "${ONECCL_INSTALLER}" && \
|
||||
@@ -85,6 +88,9 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
|
||||
# FIX triton
|
||||
RUN --mount=type=cache,target=/root/.cache/pip pip uninstall triton triton-xpu -y && pip install triton-xpu==3.6.0 --extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
|
||||
# PyJWT-2.7.0 will influence some wheel behaviors, remove its dist-info to avoid conflicts
|
||||
RUN rm /usr/lib/python3/dist-packages/PyJWT-2.7.0.dist-info/ -rf
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@
|
||||
"default": "true"
|
||||
},
|
||||
"FLASHINFER_VERSION": {
|
||||
"default": "0.6.1"
|
||||
"default": "0.6.2"
|
||||
},
|
||||
"GDRCOPY_CUDA_VERSION": {
|
||||
"default": "12.8"
|
||||
|
||||
@@ -32,6 +32,7 @@ th {
| HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
| Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
| Custom | ✅ | ✅ | Local file: `data.jsonl` |
| Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` |

Legend:
|
||||
|
||||
@@ -133,6 +134,33 @@ vllm bench serve --port 9001 --save-result --save-detailed \

You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

#### Custom multimodal dataset

If the multimodal dataset you want to benchmark is not supported yet in vLLM, then you can benchmark on it using `CustomMMDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" and "image_files" fields per entry, e.g., `mm_data.jsonl`:

```json
{"prompt": "How many animals are present in the given image?", "image_files": ["/path/to/image/folder/horsepony.jpg"]}
{"prompt": "What colour is the bird shown in the image?", "image_files": ["/path/to/image/folder/flycatcher.jpeg"]}
```

```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct
```

```bash
# run benchmarking script
vllm bench serve --save-result --save-detailed \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name custom_mm \
  --dataset-path <path-to-your-mm-data-jsonl> \
  --allowed-local-media-path /path/to/image/folder
```

Note that we need to use the `openai-chat` backend and `/v1/chat/completions` endpoint for multimodal inputs.
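If you want to create the `mm_data.jsonl` file programmatically, a minimal sketch follows; the prompts and image paths are placeholders:

```python
# Minimal sketch: write a CustomMMDataset-style JSONL file.
# The prompts and image paths below are placeholders.
import json

entries = [
    {"prompt": "How many animals are present in the given image?",
     "image_files": ["/path/to/image/folder/horsepony.jpg"]},
    {"prompt": "What colour is the bird shown in the image?",
     "image_files": ["/path/to/image/folder/flycatcher.jpeg"]},
]

with open("mm_data.jsonl", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```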
|
||||
|
||||
#### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
|
||||
@@ -82,7 +82,7 @@ vllm bench sweep serve \
|
||||
You can use `--dry-run` to preview the commands to be run.
|
||||
|
||||
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
|
||||
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
|
||||
Between each benchmark run, we call all `/reset_*_cache` endpoints to get a clean slate for the next run.
|
||||
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
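
For example, a hedged sketch of such an invocation (the parameter files, launcher command, and port below are placeholders, not files in the repository; check `vllm bench sweep serve --help` for the exact flag semantics):

```bash
# Preview the sweep first with --dry-run; the custom --after-bench-cmd resets
# the prefix cache between runs instead of the default reset endpoints.
vllm bench sweep serve \
    --serve-cmd "my-launcher --port 8000" \
    --serve-params serve_params.json \
    --bench-params bench_params.json \
    --after-bench-cmd "curl -X POST http://localhost:8000/reset_prefix_cache" \
    --dry-run
```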
|
||||
|
||||
!!! note
|
||||
|
||||
@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
|
||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
|
||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||
- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)
|
||||
|
||||
## Test with the API
|
||||
|
||||
|
||||
@@ -59,11 +59,15 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa
|
||||
Here, the `token` field stores your **Hugging Face access token**. For details on how to generate a token,
|
||||
see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens).
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service.
|
||||
|
||||
Note that you will want to configure your vLLM image based on your processor arch:
|
||||
|
||||
??? console "Config"
|
||||
|
||||
```bash
|
||||
VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest # use this for x86_64
|
||||
VLLM_IMAGE=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest # use this for arm64
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
@@ -81,7 +85,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
image: $VLLM_IMAGE
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
@@ -168,7 +168,7 @@ Priority is **1 = highest** (tried first).
|
||||
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | All | 9.x |
|
||||
| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | Any |
|
||||
| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | Decoder, Encoder Only | Any |
|
||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | %16 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | N/A |
|
||||
| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | N/A |
|
||||
| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | Any |
|
||||
|
||||
@@ -282,6 +282,15 @@ If vLLM's compile cache is wrong, this usually means that a factor is missing.
|
||||
Please see [this example](https://github.com/vllm-project/vllm/blob/18b39828d90413d05d770dfd2e2f48304f4ca0eb/vllm/config/model.py#L310)
|
||||
of how vLLM computes part of the cache key.
|
||||
|
||||
vLLM's compilation cache requires that the code being compiled ends up being serializable.
|
||||
If this is not the case, then it will error out on save. Usually the fixes are to either:
|
||||
|
||||
- rewrite the non-serializable pieces (this can be tricky, because right now it is hard to tell what is serializable and what isn't)
|
||||
- file a bug report
|
||||
- ignore the error by setting `VLLM_DISABLE_COMPILE_CACHE=1`, as shown below (note that this will make warm server starts a lot slower).
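
For the last option, a minimal sketch (the model name is a placeholder):

```bash
# Disable the compile cache for this run; expect slower warm starts.
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve <MODEL>
```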
|
||||
|
||||
## Debugging CUDAGraphs
|
||||
|
||||
CUDAGraphs is a feature that allows one to:
|
||||
|
||||
@@ -154,4 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
|
||||
!!! warning "Deprecations"
|
||||
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
|
||||
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
||||
- `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||
- `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
|
||||
|
||||
@@ -2,6 +2,6 @@
|
||||
|
||||
vLLM's examples are split into three categories:
|
||||
|
||||
- If you are using vLLM from within Python code, see the *Offline Inference* section.
|
||||
- If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section.
|
||||
- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
|
||||
- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
|
||||
|
||||
@@ -108,6 +108,7 @@ Batch invariance has been tested and verified on the following models:
|
||||
- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct`
|
||||
- **Qwen2.5**: `Qwen/Qwen2.5-0.5B-Instruct`, `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`, `Qwen/Qwen2.5-14B-Instruct`, `Qwen/Qwen2.5-32B-Instruct`
|
||||
- **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`
|
||||
- **GPT-OSS**: `openai/gpt-oss-20b`, `openai/gpt-oss-120b`
|
||||
|
||||
Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose).
|
||||
|
||||
|
||||
@@ -19,12 +19,13 @@ Two main reasons:
|
||||
|
||||
Please refer to [examples/online_serving/disaggregated_prefill.sh](../../examples/online_serving/disaggregated_prefill.sh) for the example usage of disaggregated prefilling.
|
||||
|
||||
Now supports 5 types of connectors:
|
||||
Now supports 6 types of connectors:
|
||||
|
||||
- **ExampleConnector**: refer to [examples/offline_inference/disaggregated-prefill-v1/run.sh](../../examples/offline_inference/disaggregated-prefill-v1/run.sh) for the example usage of ExampleConnector disaggregated prefilling.
|
||||
- **LMCacheConnectorV1**: refer to [examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh](../../examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh) for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
|
||||
- **NixlConnector**: refer to [tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) for the example usage of NixlConnector disaggregated prefilling, which supports fully async send/recv. For a detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
|
||||
- **P2pNcclConnector**: refer to [examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh](../../examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh) for the example usage of P2pNcclConnector disaggregated prefilling.
|
||||
- **MooncakeConnector**: refer to [examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh) for the example usage of MooncakeConnector disaggregated prefilling. For a detailed usage guide, see [MooncakeConnector Usage Guide](mooncake_connector_usage.md).
|
||||
- **MultiConnector**: takes advantage of the `kv_connector_extra_config: dict[str, Any]` already present in `KVTransferConfig` to stash all the connectors we want in an ordered list of kwargs, such as:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -31,11 +31,9 @@ vllm serve Qwen/Qwen2.5-7B-Instruct --port 8020 --kv-transfer-config '{"kv_conne
|
||||
### Proxy
|
||||
|
||||
```bash
|
||||
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --prefiller-host 192.168.0.2 --prefiller-port 8010 --decoder-host 192.168.0.3 --decoder-port 8020
|
||||
python examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py --prefill http://192.168.0.2:8010 --decode http://192.168.0.3:8020
|
||||
```
|
||||
|
||||
> NOTE: The Mooncake Connector currently uses the proxy from nixl_integration. This will be replaced with a self-developed proxy in the future.
|
||||
|
||||
Now you can send requests to the proxy server through port 8000.
|
||||
|
||||
## Environment Variables
|
||||
@@ -43,16 +41,29 @@ Now you can send requests to the proxy server through port 8000.
|
||||
- `VLLM_MOONCAKE_BOOTSTRAP_PORT`: Port for Mooncake bootstrap server
|
||||
- Default: 8998
|
||||
- Required only for prefiller instances
|
||||
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
|
||||
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank
|
||||
- Used for the decoder notifying the prefiller
|
||||
- For headless instances, must be the same as the master instance
|
||||
- Each instance needs a unique port on its host; using the same port number across different hosts is fine
|
||||
|
||||
- `VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||
- Default: 480
|
||||
- If a request is aborted and the decoder has not yet notified the prefiller, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
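
As an illustrative sketch only (model, host, and ports are placeholders, and the `--kv-transfer-config` shape follows the pattern used for the other connectors in these docs), a prefiller instance might be launched as:

```bash
# Prefiller with an explicit bootstrap port and abort timeout.
VLLM_MOONCAKE_BOOTSTRAP_PORT=8998 \
VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT=480 \
vllm serve <MODEL> --port 8010 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
```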
|
||||
|
||||
## KV Role Options
|
||||
## KV Transfer Config
|
||||
|
||||
### KV Role Options
|
||||
|
||||
- **kv_producer**: For prefiller instances that generate KV caches
|
||||
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
|
||||
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
|
||||
|
||||
### kv_connector_extra_config
|
||||
|
||||
- **num_workers**: Size of the thread pool used by one prefiller worker to transfer KV caches via Mooncake (default: 10)
- **mooncake_protocol**: Mooncake connector protocol (default: "rdma"); see the sketch below for how these options are passed
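
A hedged sketch of passing these options (the values are examples only and `<MODEL>` is a placeholder):

```bash
vllm serve <MODEL> \
    --kv-transfer-config '{
        "kv_connector": "MooncakeConnector",
        "kv_role": "kv_producer",
        "kv_connector_extra_config": {"num_workers": 16, "mooncake_protocol": "rdma"}
    }'
```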
|
||||
|
||||
## Example Scripts/Code
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
- [run_mooncake_connector.sh](../../examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh)
|
||||
- [mooncake_connector_proxy.py](../../examples/online_serving/disaggregated_serving/mooncake_connector/mooncake_connector_proxy.py)
|
||||
|
||||
@@ -36,6 +36,35 @@ export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1
|
||||
!!! tip
|
||||
When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables.
|
||||
|
||||
#### Selecting a NIXL transport backend (plugin)
|
||||
|
||||
NixlConnector can use different NIXL transport backends (plugins). By default, NixlConnector uses UCX as the transport backend.
|
||||
|
||||
To select a different backend, set `kv_connector_extra_config.backends` in `--kv-transfer-config`.
|
||||
|
||||
### Example: using LIBFABRIC backend
|
||||
|
||||
```bash
|
||||
vllm serve <MODEL> \
|
||||
--kv-transfer-config '{
|
||||
"kv_connector":"NixlConnector",
|
||||
"kv_role":"kv_both",
|
||||
"kv_connector_extra_config":{"backends":["LIBFABRIC"]}
|
||||
}'
|
||||
```
|
||||
|
||||
You can also pass JSON keys individually using dotted arguments, and you can append list elements using `+`:
|
||||
|
||||
```bash
|
||||
vllm serve <MODEL> \
|
||||
--kv-transfer-config.kv_connector NixlConnector \
|
||||
--kv-transfer-config.kv_role kv_both \
|
||||
--kv-transfer-config.kv_connector_extra_config.backends+ LIBFABRIC
|
||||
```
|
||||
|
||||
!!! note
|
||||
Backend availability depends on how NIXL was built and what plugins are present in your environment. Refer to the [NIXL repository](https://github.com/ai-dynamo/nixl) for available backends and build instructions.
|
||||
|
||||
## Basic Usage (on the same host)
|
||||
|
||||
### Producer (Prefiller) Configuration
|
||||
|
||||
@@ -6,34 +6,38 @@
|
||||
!!! warning
|
||||
Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge the files into a single-file model.
|
||||
|
||||
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
|
||||
To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference with 2 GPUs:
|
||||
|
||||
```bash
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--tensor-parallel-size 2
|
||||
```
|
||||
|
||||
Alternatively, you can download and use a local GGUF file:
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
|
||||
vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
!!! warning
|
||||
We recommend using the tokenizer from the base model instead of the GGUF model, because tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocab size.
|
||||
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
|
||||
GGUF assumes that HuggingFace can convert the metadata to a config file. In case HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:
|
||||
|
||||
```bash
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
--hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
# If your model is not supported by HuggingFace you can manually provide a HuggingFace compatible config path
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--hf-config-path Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also use the GGUF model directly through the LLM entrypoint:
|
||||
@@ -66,10 +70,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
# Create an LLM using repo_id:quant_type format.
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
|
||||
tokenizer="Qwen/Qwen3-0.6B",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
|
||||
@@ -136,15 +136,31 @@ Testing has been conducted on AWS Graviton3 instances for compatibility.
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
|
||||
To pull the latest image:
|
||||
|
||||
Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
|
||||
```bash
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
|
||||
```
|
||||
|
||||
To pull an image with a specific vLLM version:
|
||||
|
||||
```bash
|
||||
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${VLLM_VERSION}
|
||||
```
|
||||
|
||||
All available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo).
|
||||
|
||||
You can run these images via:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:<tag> <args...>
|
||||
```
|
||||
|
||||
You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days.
|
||||
|
||||
The latest code can contain bugs and may not be stable. Please use it with caution.
|
||||
|
||||
@@ -161,7 +161,23 @@ uv pip install dist/*.whl
|
||||
# --8<-- [end:build-wheel-from-source]
|
||||
# --8<-- [start:pre-built-images]
|
||||
|
||||
[https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
You can pull the latest available CPU image via:
|
||||
|
||||
```bash
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
|
||||
```
|
||||
|
||||
If you want a more specific build, you can find all published CPU-based images here: [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
|
||||
You can run these images via:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
-p 8000:8000 \
|
||||
--env "HF_TOKEN=<secret>" \
|
||||
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:<tag> <args...>
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
|
||||
|
||||
@@ -31,7 +31,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/
|
||||
To install a specific version and ROCm variant of vLLM wheel.
|
||||
|
||||
```bash
|
||||
uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
|
||||
```
|
||||
|
||||
!!! warning "Caveats for using `pip`"
|
||||
@@ -41,7 +41,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
If you insist on using `pip`, you have to specify the exact vLLM version and full URL of the wheel path `https://wheels.vllm.ai/rocm/<version>/<rocm-variant>` (which can be obtained from the web page).
|
||||
|
||||
```bash
|
||||
pip install vllm==0.14.1+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.14.1/rocm700
|
||||
pip install vllm==0.15.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
|
||||
```
|
||||
|
||||
# --8<-- [end:pre-built-wheels]
|
||||
|
||||
@@ -307,6 +307,62 @@ An OpenAI client example can be found here: [examples/pooling/embed/openai_embed
|
||||
|
||||
## Specific models
|
||||
|
||||
### ColBERT Late Interaction Models
|
||||
|
||||
[ColBERT](https://arxiv.org/abs/2004.12832) (Contextualized Late Interaction over BERT) is a retrieval model that uses per-token embeddings and MaxSim scoring for document ranking. Unlike single-vector embedding models, ColBERT retains token-level representations and computes relevance scores through late interaction, providing better accuracy while being more efficient than cross-encoders.
|
||||
|
||||
vLLM supports ColBERT models for reranking tasks, automatically applying MaxSim scoring for query-document relevance:
|
||||
|
||||
```shell
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
```
|
||||
|
||||
Currently supports ColBERT models with standard BERT encoders (e.g., `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0`).
|
||||
|
||||
ColBERT models with modified encoder architectures are not yet supported, including BERT variants with rotary embeddings (e.g., `jinaai/jina-colbert-v2`) or other custom encoders (e.g., `LiquidAI/LFM2-ColBERT-350M`).
|
||||
|
||||
If your standard BERT ColBERT model's config doesn't specify the architecture as `HF_ColBERT`, override it with:
|
||||
|
||||
```shell
|
||||
vllm serve your-colbert-model --hf-overrides '{"architectures": ["HF_ColBERT"]}'
|
||||
```
|
||||
|
||||
Then you can use the rerank endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/rerank -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks."
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Or the score endpoint:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/score -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"text_1": "What is machine learning?",
|
||||
"text_2": ["Machine learning is a subset of AI.", "The weather is sunny."]
|
||||
}'
|
||||
```
|
||||
|
||||
You can also get the raw token embeddings using the pooling endpoint with `token_embed` task:
|
||||
|
||||
```shell
|
||||
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"input": "What is machine learning?",
|
||||
"task": "token_embed"
|
||||
}'
|
||||
```
|
||||
|
||||
An example can be found here: [examples/pooling/score/colbert_rerank_online.py](../../examples/pooling/score/colbert_rerank_online.py)
|
||||
|
||||
### BAAI/bge-m3
|
||||
|
||||
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
|
||||
@@ -352,15 +408,6 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
|
||||
- `token_embed` is the same as `embed`, using normalization as the activation.
|
||||
- `token_classify` is the same as `classify`, by default using softmax as the activation.
|
||||
|
||||
### Remove softmax from PoolingParams
|
||||
|
||||
We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
|
||||
|
||||
### as_reward_model
|
||||
|
||||
!!! warning
|
||||
We are going to remove `--convert reward` in v0.15; use `--convert embed` instead.
|
||||
|
||||
Pooling models now support all pooling tasks by default; you can use them without any extra settings.
|
||||
|
||||
- Extracting hidden states is best done with the `token_embed` task.
|
||||
|
||||
@@ -365,6 +365,7 @@ th {
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `thu-coai/ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ |
|
||||
| `CwmForCausalLM` | CWM | `facebook/cwm`, etc. | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -375,7 +376,7 @@ th {
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `ExaoneMoeCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
|
||||
| `ExaoneMoEForCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B`, etc. | | |
|
||||
| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ |
|
||||
@@ -389,6 +390,7 @@ th {
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeLiteForCausalLM` | GLM-4.7-Flash | `zai-org/GLM-4.7-Flash`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `openai-community/gpt2`, `openai-community/gpt2-xl`, etc. | | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
|
||||
@@ -403,7 +405,6 @@ th {
|
||||
| `Grok1ForCausalLM` | Grok2 | `xai-org/grok-2` | ✅︎ | ✅︎ |
|
||||
| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ |
|
||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -416,12 +417,14 @@ th {
|
||||
| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ |
|
||||
| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ |
|
||||
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
|
||||
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
|
||||
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
|
||||
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01-hf`, etc. | | |
|
||||
| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
|
||||
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
|
||||
@@ -429,10 +432,10 @@ th {
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OlmoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `Olmo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Olmo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ |
|
||||
| `OlmoeForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ |
|
||||
| `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | |
|
||||
@@ -451,18 +454,21 @@ th {
|
||||
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `RWForCausalLM` | Falcon RW | `tiiuae/falcon-40b`, etc. | | ✅︎ |
|
||||
| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
|
||||
| `StableLMEpochForCausalLM` | StableLM Epoch | `stabilityai/stablelm-zephyr-3b`, etc. | | ✅︎ |
|
||||
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
|
||||
| `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
|
||||
| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. | | ✅︎ |
|
||||
| `TeleChatForCausalLM` | TeleChat | `chuhac/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Grok2 requires `tokenizer.tok.json` with `tiktoken` installed. You can optionally override MoE router renormalization with `moe_router_renormalize`.
|
||||
@@ -657,7 +663,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
|
||||
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
|
||||
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A<sup>+</sup> | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
|
||||
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
|
||||
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
|
||||
@@ -666,6 +672,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
|
||||
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `DeepseekOCR2ForCausalLM` | DeepSeek-OCR-2 | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR-2`, etc. | ✅︎ | ✅︎ |
|
||||
| `Eagle2_5_VLForConditionalGeneration` | Eagle2.5-VL | T + I<sup>E+</sup> | `nvidia/Eagle2.5-8B`, etc. | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
@@ -676,11 +683,13 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
|
||||
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternS1ProForConditionalGeneration` | Intern-S1-Pro | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1-Pro`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
| `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
|
||||
@@ -705,6 +714,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
|
||||
| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
|
||||
@@ -771,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
|
||||
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
|
||||
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
|
||||
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
|
||||
|
||||
|
||||
@@ -59,6 +59,8 @@ We currently support the following OpenAI APIs:
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Translation API](#translations-api) (`/v1/audio/translations`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
- [Realtime API](#realtime-api) (`/v1/realtime`)
|
||||
- Only applicable to [Automatic Speech Recognition (ASR) models](../models/supported_models.md#transcription).
|
||||
|
||||
In addition, we have the following custom APIs:
|
||||
|
||||
@@ -567,6 +569,52 @@ The following extra parameters are supported:
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
|
||||
```
|
||||
|
||||
### Realtime API
|
||||
|
||||
The Realtime API provides WebSocket-based streaming audio transcription, allowing real-time speech-to-text as audio is being recorded.
|
||||
|
||||
!!! note
|
||||
To use the Realtime API, please install with extra audio dependencies using `uv pip install vllm[audio]`.
|
||||
|
||||
#### Audio Format
|
||||
|
||||
Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono channel.
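
One way to produce audio in this format, assuming `ffmpeg` and the GNU `base64` utility are available (neither is a vLLM dependency), is:

```bash
# Convert any input file to 16 kHz, mono, signed 16-bit little-endian PCM,
# then base64-encode it for the input_audio_buffer.append payload.
ffmpeg -i input.wav -f s16le -acodec pcm_s16le -ac 1 -ar 16000 audio.pcm
base64 -w0 audio.pcm > audio.b64
```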
|
||||
|
||||
#### Protocol Overview
|
||||
|
||||
1. Client connects to `ws://host/v1/realtime`
|
||||
2. Server sends `session.created` event
|
||||
3. Client optionally sends `session.update` with model/params
|
||||
4. Client sends `input_audio_buffer.commit` when ready
|
||||
5. Client sends `input_audio_buffer.append` events with base64 PCM16 chunks
|
||||
6. Server sends `transcription.delta` events with incremental text
|
||||
7. Server sends `transcription.done` with final text + usage
|
||||
8. Repeat from step 5 for the next utterance
9. Optionally, the client sends `input_audio_buffer.commit` with `"final": true` to signal that audio input is finished. Useful when streaming audio files.
|
||||
|
||||
#### Client → Server Events
|
||||
|
||||
| Event | Description |
|
||||
|-------|-------------|
|
||||
| `input_audio_buffer.append` | Send base64-encoded audio chunk: `{"type": "input_audio_buffer.append", "audio": "<base64>"}` |
|
||||
| `input_audio_buffer.commit` | Trigger transcription processing or end: `{"type": "input_audio_buffer.commit", "final": bool}` |
|
||||
| `session.update` | Configure session: `{"type": "session.update", "model": "model-name"}` |
|
||||
|
||||
#### Server → Client Events
|
||||
|
||||
| Event | Description |
|
||||
|-------|-------------|
|
||||
| `session.created` | Connection established with session ID and timestamp |
|
||||
| `transcription.delta` | Incremental transcription text: `{"type": "transcription.delta", "delta": "text"}` |
|
||||
| `transcription.done` | Final transcription with usage stats |
|
||||
| `error` | Error notification with message and optional code |
|
||||
|
||||
#### Example Clients
|
||||
|
||||
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
|
||||
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
|
||||
|
||||
### Tokenizer API
|
||||
|
||||
Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer).
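
As a hedged sketch (the exact request and response fields may differ between vLLM versions, and the model name is a placeholder), a tokenization request might look like:

```bash
curl -s http://localhost:8000/tokenize \
    -H "Content-Type: application/json" \
    -d '{"model": "meta-llama/Llama-3.2-1B-Instruct", "prompt": "Hello, world!"}'
```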
|
||||
|
||||
@@ -172,12 +172,13 @@ These endpoints are **only available when the environment variable `VLLM_SERVER_
|
||||
- `/server_info` - Get detailed server configuration
|
||||
- `/reset_prefix_cache` - Reset prefix cache (can disrupt service)
|
||||
- `/reset_mm_cache` - Reset multimodal cache (can disrupt service)
|
||||
- `/reset_encoder_cache` - Reset encoder cache (can disrupt service)
|
||||
- `/sleep` - Put engine to sleep (causes denial of service)
|
||||
- `/wake_up` - Wake engine from sleep
|
||||
- `/is_sleeping` - Check if engine is sleeping
|
||||
- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous)
|
||||
|
||||
**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` are set):**
|
||||
**Profiler endpoints (only when profiling is enabled via `--profiler-config`):**
|
||||
|
||||
These endpoints are only available when profiling is enabled and should only be used for local development:
|
||||
|
||||
@@ -206,7 +207,7 @@ An attacker who can reach the vLLM HTTP server can:
|
||||
- Cache manipulation that can disrupt service
|
||||
- Detailed server configuration disclosure
|
||||
|
||||
Similarly, never enable profiler endpoints (`VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production.
|
||||
Similarly, never enable profiler endpoints in production.
|
||||
|
||||
**Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies.
|
||||
|
||||
|
||||
@@ -70,6 +70,34 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# MusicFlamingo
|
||||
def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "nvidia/music-flamingo-2601-hf"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
# MusicFlamingo uses <sound> token for audio
|
||||
audio_placeholder = "<sound>" * audio_count
|
||||
|
||||
prompt = (
|
||||
"<|im_start|>system\n"
|
||||
"You are a helpful assistant.<|im_end|>\n"
|
||||
"<|im_start|>user\n"
|
||||
f"{audio_placeholder}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# Gemma3N
|
||||
def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
@@ -452,6 +480,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"audioflamingo3": run_audioflamingo3,
|
||||
"musicflamingo": run_musicflamingo,
|
||||
"gemma3n": run_gemma3n,
|
||||
"glmasr": run_glmasr,
|
||||
"funaudiochat": run_funaudiochat,
|
||||
|
||||
@@ -56,17 +56,10 @@ Try it yourself with the following argument:
|
||||
|
||||
vLLM supports models that are quantized using GGUF.
|
||||
|
||||
Try one yourself by downloading a quantized GGUF model and using the following arguments:
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download
|
||||
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
|
||||
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
|
||||
print(hf_hub_download(repo_id, filename=filename))
|
||||
```
|
||||
Try one yourself using the `repo_id:quant_type` format to load directly from HuggingFace:
|
||||
|
||||
```bash
|
||||
--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
|
||||
--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
### CPU offload
|
||||
|
||||
@@ -38,8 +38,8 @@ def get_prompt_embeds(
|
||||
embedding_layer: torch.nn.Module,
|
||||
):
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
chat, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
chat, add_generation_prompt=True, return_tensors="pt", return_dict=True
|
||||
).input_ids
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
return prompt_embeds
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ https://docs.ray.io/en/latest/placement-groups.html
|
||||
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
|
||||
import ray
|
||||
import torch
|
||||
@@ -40,6 +41,10 @@ from torch.multiprocessing.reductions import reduce_tensor
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
if torch.version.hip is not None:
|
||||
print("Skipping test for ROCm. Ray is unsupported on vLLM ROCm.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
class MyLLM(LLM):
|
||||
"""Configure the vLLM worker for Ray placement group execution.
|
||||
|
||||
@@ -5,7 +5,6 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.v1.metrics.reader import Counter, Vector
|
||||
|
||||
try:
|
||||
@@ -56,6 +55,7 @@ def parse_args():
|
||||
default="eagle",
|
||||
choices=["ngram", "eagle", "eagle3", "mtp", "draft_model"],
|
||||
)
|
||||
parser.add_argument("--backend", type=str, default="openai")
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
parser.add_argument("--prompt-lookup-min", type=int, default=2)
|
||||
@@ -75,12 +75,11 @@ def parse_args():
|
||||
parser.add_argument("--gpu-memory-utilization", type=float, default=0.9)
|
||||
parser.add_argument("--disable-padded-drafter-batch", action="store_true")
|
||||
parser.add_argument("--max-num-seqs", type=int, default=None)
|
||||
parser.add_argument("--allowed-local-media-path", type=str, default="")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
args.endpoint_type = "openai-chat"
|
||||
|
||||
model_dir = args.model_dir
|
||||
if args.model_dir is None:
|
||||
if args.custom_mm_prompts:
|
||||
@@ -91,19 +90,25 @@ def main(args):
|
||||
)
|
||||
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
||||
args.custom_skip_chat_template = True
|
||||
|
||||
if not args.custom_mm_prompts:
|
||||
prompts = get_samples(args, tokenizer)
|
||||
# add_special_tokens is False to avoid adding bos twice
|
||||
# when using chat templates
|
||||
prompt_ids = [
|
||||
tokenizer.encode(prompt.prompt, add_special_tokens=False)
|
||||
for prompt in prompts
|
||||
]
|
||||
if args.custom_mm_prompts:
|
||||
prompts = llm_prompts = get_custom_mm_prompts(args.num_prompts)
|
||||
else:
|
||||
prompts = get_custom_mm_prompts(args.num_prompts)
|
||||
|
||||
prompts = get_samples(args, tokenizer)
|
||||
if args.enable_multimodal_chat:
|
||||
llm_prompts = [p.prompt for p in prompts]
|
||||
else:
|
||||
# add_special_tokens is False to avoid adding bos twice
|
||||
# when using chat templates
|
||||
llm_prompts = [
|
||||
{
|
||||
"prompt_token_ids": tokenizer.encode(
|
||||
prompt.prompt, add_special_tokens=False
|
||||
),
|
||||
"multi_modal_data": prompt.multi_modal_data,
|
||||
}
|
||||
for prompt in prompts
|
||||
]
|
||||
if args.method == "eagle" or args.method == "eagle3":
|
||||
eagle_dir = args.eagle_dir
|
||||
if args.method == "eagle" and eagle_dir is None:
|
||||
@@ -154,16 +159,17 @@ def main(args):
|
||||
limit_mm_per_prompt={"image": 5},
|
||||
disable_chunked_mm_input=True,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
allowed_local_media_path=args.allowed_local_media_path,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
if not args.custom_mm_prompts:
|
||||
if args.backend == "openai-chat":
|
||||
outputs = llm.chat(llm_prompts, sampling_params=sampling_params)
|
||||
else:
|
||||
outputs = llm.generate(
|
||||
[TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
|
||||
llm_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
else:
|
||||
outputs = llm.chat(prompts, sampling_params=sampling_params)
|
||||
|
||||
# print the generated text
|
||||
if args.print_output:
|
||||
@@ -219,6 +225,8 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
args.enable_multimodal_chat = args.backend == "openai-chat"
|
||||
|
||||
acceptance_length = main(args)
|
||||
|
||||
if args.test:
|
||||
|
||||
@@ -270,6 +270,49 @@ def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_deepseek_ocr2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
|
||||
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "deepseek-ai/DeepSeek-OCR-2"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
logits_processors=[NGramPerReqLogitsProcessor],
|
||||
)
|
||||
|
||||
# deepseek-ocr use plain prompt template
|
||||
prompts = [f"<image>\n{question}" for question in questions]
|
||||
|
||||
# The following sampling params config is taken from
|
||||
# the official Deepseek-OCR inference example.
|
||||
# (IMPORTANT) Use the custom logits processor and avoid skipping
|
||||
# special tokens for this model for the optimal OCR performance.
|
||||
sampling_params = [
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=8192,
|
||||
# ngram logit processor args
|
||||
extra_args=dict(
|
||||
ngram_size=30,
|
||||
window_size=90,
|
||||
# whitelist: <td>, </td>
|
||||
whitelist_token_ids={128821, 128822},
|
||||
),
|
||||
skip_special_tokens=False,
|
||||
)
|
||||
for _ in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
|
||||
# Dots-OCR
|
||||
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -799,6 +842,40 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Intern-S1-Pro
|
||||
def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "internlm/Intern-S1-Pro"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
||||
messages = [
|
||||
[{"role": "user", "content": f"{placeholder}\n{question}"}]
|
||||
for question in questions
|
||||
]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL3-2B"
|
||||
@@ -952,6 +1029,31 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Kimi-K2.5
|
||||
def run_kimi_k25(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "vision_chunk"
|
||||
|
||||
prompts = [
|
||||
"<|im_user|>user<|media_begin|>image<|media_content|>"
|
||||
f"<|media_pad|><|media_end|>{question}<|im_end|>"
|
||||
"<|im_assistant|>assistant<|im_middle|>"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="moonshotai/Kimi-K2.5",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LightOnOCR
|
||||
def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@@ -1394,6 +1496,37 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    )


# OpenPangu
def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "FreedomIntelligence/openPangu-VL-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=4,
        trust_remote_code=True,
        enforce_eager=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "[unused19]"
    elif modality == "video":
        placeholder = "[unused32]"

    # Template role markers: 系统 = system, 用户 = user, 助手 = assistant
    prompts = [
        (
            f"<s>[unused9]系统:[unused10][unused9]用户:[unused18]{placeholder}[unused20]{question}[unused10][unused9]助手:"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -2014,6 +2147,7 @@ model_example_map = {
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "deepseek_ocr": run_deepseek_ocr,
    "deepseek_ocr2": run_deepseek_ocr2,
    "dots_ocr": run_dots_ocr,
    "eagle2_5": run_eagle2_5,
    "ernie45_vl": run_ernie45_vl,
@@ -2030,11 +2164,13 @@ model_example_map = {
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
    "interns1": run_interns1,
    "interns1_pro": run_interns1_pro,
    "internvl_chat": run_internvl,
    "kanana_v": run_kanana_v,
    "keye_vl": run_keye_vl,
    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
    "kimi_k25": run_kimi_k25,
    "lightonocr": run_lightonocr,
    "lfm2_vl": run_lfm2_vl,
    "llama4": run_llama4,
@@ -2051,6 +2187,7 @@ model_example_map = {
    "molmo2": run_molmo2,
    "nemotron_vl": run_nemotron_vl,
    "NVLM_D": run_nvlm_d,
    "openpangu_vl": run_openpangu_vl,
    "ovis": run_ovis,
    "ovis2_5": run_ovis2_5,
    "paddleocr_vl": run_paddleocr_vl,
@@ -2120,6 +2257,19 @@ def get_multi_modal_input(args):
            "questions": vid_questions,
        }

    if args.modality == "vision_chunk":
        # Input vision chunks and question
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
        vision_chunk_questions = [
            "What is the content of this image chunk?",
            "Describe the content of this image chunk in detail.",
        ]

        return {
            "data": {"type": "image", "image": image},
            "questions": vision_chunk_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)

@@ -2202,7 +2352,7 @@ def parse_args():
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        choices=["image", "video", "vision_chunk"],
        help="Modality of the input.",
    )
    parser.add_argument(
@@ -2279,7 +2429,7 @@ def main(args):
    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    default_limits = {"image": 0, "video": 0, "audio": 0, "vision_chunk": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )
@@ -765,6 +765,32 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    )


# OpenPangu
def load_openpangu_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "FreedomIntelligence/openPangu-VL-7B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[unused18][unused19][unused20]" * len(image_urls)
    # Template role markers: 系统 = system, 用户 = user, 助手 = assistant
    prompt = (
        f"<s>[unused9]系统:[unused10][unused9]用户:{question}{placeholders}"
        "[unused10][unused9]助手:"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2-1B"
@@ -1257,6 +1283,42 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    )


# GLM-4.1V
def load_glm4_1v(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=45082,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"
@@ -1388,6 +1450,7 @@ model_example_map = {
    "mistral3": load_mistral3,
    "molmo2": load_molmo2,
    "NVLM_D": load_nvlm_d,
    "openpangu_vl": load_openpangu_vl,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "paddleocr_vl": load_paddleocr_vl,
@@ -1403,6 +1466,7 @@ model_example_map = {
    "stepvl": load_step_vl,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_1v": load_glm4_1v,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
}

@@ -6,3 +6,4 @@ This example contains scripts that demonstrate the disaggregated serving feature

- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances).
- `kv_events.sh` - Demonstrates KV cache event publishing.
- `mooncake_connector` - A proxy demo for MooncakeConnector (see the example invocation below).
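A minimal sketch of launching the Mooncake proxy by hand, assuming one prefill and one
decode vLLM server are already running locally; the ports and bootstrap port below are
illustrative and mirror the defaults used in the accompanying launch script:

    python3 mooncake_connector_proxy.py \
        --prefill http://127.0.0.1:8010 8998 \
        --decode http://127.0.0.1:8020 \
        --port 8000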
@@ -0,0 +1,376 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import ipaddress
|
||||
import itertools
|
||||
import os
|
||||
import urllib
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
|
||||
def maybe_wrap_ipv6_address(address: str) -> str:
|
||||
try:
|
||||
ipaddress.IPv6Address(address)
|
||||
return f"[{address}]"
|
||||
except ValueError:
|
||||
return address
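# Illustrative note (added for clarity, not part of the original file): the helper
# leaves IPv4 addresses and hostnames untouched and only brackets IPv6 literals, e.g.
#   maybe_wrap_ipv6_address("10.0.0.1") -> "10.0.0.1"
#   maybe_wrap_ipv6_address("::1")      -> "[::1]"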
|
||||
|
||||
|
||||
def make_http_path(host: str, port: int) -> str:
|
||||
return f"http://{host}:{port}"
|
||||
|
||||
|
||||
def prefiller_cycle(prefill_clients: list[Any]):
|
||||
while True:
|
||||
for prefill_client in prefill_clients:
|
||||
for i in range(prefill_client["dp_size"]):
|
||||
yield prefill_client, i
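# Illustrative note (added for clarity, not part of the original file): with two
# prefill clients p0 (dp_size=2) and p1 (dp_size=1), the generator yields
#   (p0, 0), (p0, 1), (p1, 0), (p0, 0), ...
# i.e. round-robin over clients and their data-parallel ranks.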
|
||||
|
||||
|
||||
async def get_prefiller_info(prefill_clients: list, ready: asyncio.Event):
|
||||
for prefill_client in prefill_clients:
|
||||
while True:
|
||||
try:
|
||||
# Wait for prefill service to be ready
|
||||
response = await prefill_client["client"].get("/health")
|
||||
response.raise_for_status()
|
||||
except Exception:
|
||||
await asyncio.sleep(1)
|
||||
continue
|
||||
|
||||
response = await prefill_client["client"].get(
|
||||
prefill_client["bootstrap_addr"] + "/query"
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
break
|
||||
|
||||
for dp_rank, dp_entry in data.items():
|
||||
prefill_client["dp_engine_id"][int(dp_rank)] = dp_entry["engine_id"]
|
||||
dp_size = len(data)
|
||||
prefill_client["dp_size"] = dp_size
|
||||
print(f"Inited prefiller {prefill_client['url']} with dp_size={dp_size}")
|
||||
|
||||
ready.set()
|
||||
print("All prefiller instances are ready.")
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Lifespan context manager to handle startup and shutdown events.
|
||||
"""
|
||||
# Startup: Initialize client pools for prefiller and decoder services
|
||||
app.state.prefill_clients = []
|
||||
app.state.decode_clients = []
|
||||
app.state.ready = asyncio.Event()
|
||||
|
||||
# Create prefill clients
|
||||
for i, (url, bootstrap_port) in enumerate(global_args.prefill):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
|
||||
app.state.prefill_clients.append(
|
||||
{
|
||||
"client": httpx.AsyncClient(
|
||||
timeout=None,
|
||||
base_url=url,
|
||||
limits=httpx.Limits(
|
||||
max_connections=None,
|
||||
max_keepalive_connections=None,
|
||||
),
|
||||
),
|
||||
"url": url,
|
||||
"bootstrap_addr": make_http_path(hostname, bootstrap_port or 8998),
|
||||
"dp_engine_id": {},
|
||||
}
|
||||
)
|
||||
|
||||
# Create decode clients
|
||||
for i, url in enumerate(global_args.decode):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
|
||||
app.state.decode_clients.append(
|
||||
{
|
||||
"client": httpx.AsyncClient(
|
||||
timeout=None,
|
||||
base_url=url,
|
||||
limits=httpx.Limits(
|
||||
max_connections=None,
|
||||
max_keepalive_connections=None,
|
||||
),
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
asyncio.create_task(get_prefiller_info(app.state.prefill_clients, app.state.ready))
|
||||
|
||||
# Initialize round-robin iterators
|
||||
app.state.prefill_iterator = prefiller_cycle(app.state.prefill_clients)
|
||||
app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients)))
|
||||
|
||||
print(
|
||||
f"Got {len(app.state.prefill_clients)} prefill clients "
|
||||
f"and {len(app.state.decode_clients)} decode clients."
|
||||
)
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown: Close all clients
|
||||
for client_info in app.state.prefill_clients:
|
||||
await client_info["client"].aclose()
|
||||
|
||||
for client_info in app.state.decode_clients:
|
||||
await client_info["client"].aclose()
|
||||
|
||||
|
||||
# Update FastAPI app initialization to use lifespan
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
# Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI
|
||||
parser.add_argument("--host", type=str, default="127.0.0.1")
|
||||
|
||||
# For prefiller instances
|
||||
parser.add_argument(
|
||||
"--prefill",
|
||||
nargs="+",
|
||||
action="append",
|
||||
dest="prefill_raw",
|
||||
metavar=("URL", "bootstrap_port"),
|
||||
help=(
|
||||
"Prefill server URL and optional bootstrap port. "
|
||||
"Can be specified multiple times. "
|
||||
"Format: --prefill URL [BOOTSTRAP_PORT]. "
|
||||
"BOOTSTRAP_PORT can be a port number, "
|
||||
"'none', or omitted (defaults to none)."
|
||||
),
|
||||
)
|
||||
|
||||
# For decoder instances
|
||||
parser.add_argument(
|
||||
"--decode",
|
||||
nargs=1,
|
||||
action="append",
|
||||
dest="decode_raw",
|
||||
metavar=("URL",),
|
||||
help="Decode server URL. Can be specified multiple times.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
args.prefill = _parse_prefill_urls(args.prefill_raw)
|
||||
args.decode = _parse_decode_urls(args.decode_raw)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
# From sglang router_args.py
|
||||
def _parse_prefill_urls(prefill_list):
|
||||
"""Parse prefill URLs from --prefill arguments.
|
||||
|
||||
Format: --prefill URL [BOOTSTRAP_PORT]
|
||||
Example:
|
||||
--prefill http://prefill1:8080 9000 # With bootstrap port
|
||||
--prefill http://prefill2:8080 none # Explicitly no bootstrap port
|
||||
--prefill http://prefill3:8080 # Defaults to no bootstrap port
|
||||
"""
|
||||
if not prefill_list:
|
||||
return []
|
||||
|
||||
prefill_urls = []
|
||||
for prefill_args in prefill_list:
|
||||
url = prefill_args[0]
|
||||
|
||||
# Handle optional bootstrap port
|
||||
if len(prefill_args) >= 2:
|
||||
bootstrap_port_str = prefill_args[1]
|
||||
# Handle 'none' as None
|
||||
if bootstrap_port_str.lower() == "none":
|
||||
bootstrap_port = None
|
||||
else:
|
||||
try:
|
||||
bootstrap_port = int(bootstrap_port_str)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'" # noqa: E501
|
||||
) from e
|
||||
else:
|
||||
# No bootstrap port specified, default to None
|
||||
bootstrap_port = None
|
||||
|
||||
prefill_urls.append((url, bootstrap_port))
|
||||
|
||||
return prefill_urls
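# Example (added for clarity, not part of the original file): given
#   --prefill http://prefill1:8080 9000 --prefill http://prefill2:8080
# argparse collects [["http://prefill1:8080", "9000"], ["http://prefill2:8080"]],
# and _parse_prefill_urls returns
#   [("http://prefill1:8080", 9000), ("http://prefill2:8080", None)].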
|
||||
|
||||
|
||||
def _parse_decode_urls(decode_list):
|
||||
"""Parse decode URLs from --decode arguments.
|
||||
|
||||
Format: --decode URL
|
||||
Example: --decode http://decode1:8081 --decode http://decode2:8081
|
||||
"""
|
||||
if not decode_list:
|
||||
return []
|
||||
|
||||
# decode_list is a list of single-element lists due to nargs=1
|
||||
return [url[0] for url in decode_list]
|
||||
|
||||
|
||||
def get_next_client(app, service_type: str):
|
||||
"""
|
||||
Get the next client in round-robin fashion.
|
||||
|
||||
Args:
|
||||
app: The FastAPI app instance
|
||||
service_type: Either 'prefill' or 'decode'
|
||||
|
||||
Returns:
|
||||
The next client to use
|
||||
"""
|
||||
if service_type == "prefill":
|
||||
return next(app.state.prefill_iterator)
|
||||
elif service_type == "decode":
|
||||
client_idx = next(app.state.decode_iterator)
|
||||
return app.state.decode_clients[client_idx]
|
||||
else:
|
||||
raise ValueError(f"Unknown service type: {service_type}")
|
||||
|
||||
|
||||
async def send_request_to_service(
|
||||
client_info: dict, dp_rank: int, endpoint: str, req_data: dict, request_id: str
|
||||
):
|
||||
"""
|
||||
Send a request to a service using a client from the pool.
|
||||
"""
|
||||
req_data = req_data.copy()
|
||||
req_data["kv_transfer_params"] = {
|
||||
"do_remote_decode": True,
|
||||
"do_remote_prefill": False,
|
||||
"transfer_id": f"xfer-{request_id}",
|
||||
}
|
||||
req_data["stream"] = False
|
||||
req_data["max_tokens"] = 1
|
||||
if "max_completion_tokens" in req_data:
|
||||
req_data["max_completion_tokens"] = 1
|
||||
if "stream_options" in req_data:
|
||||
del req_data["stream_options"]
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"X-Request-Id": request_id,
|
||||
"X-data-parallel-rank": str(dp_rank),
|
||||
}
|
||||
|
||||
response = await client_info["client"].post(
|
||||
endpoint, json=req_data, headers=headers
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# CRITICAL: Release connection back to pool
|
||||
await response.aclose()
|
||||
|
||||
|
||||
async def stream_service_response(
|
||||
prefill_client_info: dict,
|
||||
prefill_dp_rank: int,
|
||||
decode_client_info: dict,
|
||||
endpoint: str,
|
||||
req_data: dict,
|
||||
request_id: str,
|
||||
):
|
||||
"""
|
||||
Asynchronously stream response from a service using a client from the pool.
|
||||
"""
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
"X-Request-Id": request_id,
|
||||
}
|
||||
|
||||
req_data["kv_transfer_params"] = {
|
||||
"do_remote_decode": False,
|
||||
"do_remote_prefill": True,
|
||||
"remote_bootstrap_addr": prefill_client_info["bootstrap_addr"],
|
||||
"remote_engine_id": prefill_client_info["dp_engine_id"][prefill_dp_rank],
|
||||
"transfer_id": f"xfer-{request_id}",
|
||||
}
|
||||
|
||||
async with decode_client_info["client"].stream(
|
||||
"POST", endpoint, json=req_data, headers=headers
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
async for chunk in response.aiter_bytes():
|
||||
yield chunk
|
||||
|
||||
|
||||
async def _handle_completions(api: str, request: Request):
|
||||
if not app.state.ready.is_set():
|
||||
raise HTTPException(status_code=503, detail="Service Unavailable")
|
||||
|
||||
try:
|
||||
req_data = await request.json()
|
||||
request_id = str(uuid.uuid4())
|
||||
|
||||
# Get the next prefill client in round-robin fashion
|
||||
prefill_client_info, prefill_dp_rank = get_next_client(request.app, "prefill")
|
||||
|
||||
# Send request to prefill service
|
||||
asyncio.create_task(
|
||||
send_request_to_service(
|
||||
prefill_client_info, prefill_dp_rank, api, req_data, request_id
|
||||
)
|
||||
)
|
||||
|
||||
decode_client_info = get_next_client(request.app, "decode")
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(
|
||||
prefill_client_info,
|
||||
prefill_dp_rank,
|
||||
decode_client_info,
|
||||
api,
|
||||
req_data,
|
||||
request_id=request_id,
|
||||
):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(), media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
exc_info = sys.exc_info()
|
||||
print(f"Error occurred in disagg prefill proxy server - {api} endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
@app.post("/v1/completions")
|
||||
async def handle_completions(request: Request):
|
||||
return await _handle_completions("/v1/completions", request)
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
async def handle_chat_completions(request: Request):
|
||||
return await _handle_completions("/v1/chat/completions", request)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
global global_args
|
||||
global_args = parse_args()
|
||||
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host=global_args.host, port=global_args.port)
|
||||
@@ -0,0 +1,222 @@
|
||||
#!/bin/bash
|
||||
|
||||
# =============================================================================
|
||||
# vLLM Disaggregated Serving Script for Mooncake Connector
|
||||
# =============================================================================
|
||||
# This script demonstrates disaggregated prefill and decode serving using
|
||||
# Mooncake Connector.
|
||||
#
|
||||
# Configuration can be customized via environment variables:
|
||||
# MODEL: Model to serve
|
||||
# PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
|
||||
# DECODE_GPUS: Comma-separated GPU IDs for decode servers
|
||||
# PREFILL_PORTS: Comma-separated ports for prefill servers
|
||||
# BOOTSTRAP_PORTS: Comma-separated bootstrap server ports for the prefill servers
|
||||
# DECODE_PORTS: Comma-separated ports for decode servers
|
||||
# PROXY_PORT: Proxy server port used to set up the P/D disaggregated connection.
|
||||
# TIMEOUT_SECONDS: Server startup timeout
|
||||
# =============================================================================
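# A hypothetical invocation overriding the defaults below, e.g. to run two prefill
# and two decode servers (the script filename is illustrative; adjust to the actual name):
#   MODEL=Qwen/Qwen2.5-7B-Instruct PREFILL_GPUS=0,1 DECODE_GPUS=2,3 \
#   PREFILL_PORTS=8010,8011 BOOTSTRAP_PORTS=8998,8999 DECODE_PORTS=8020,8021 \
#   bash disagg_example_mooncake.sh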
|
||||
|
||||
# Configuration - can be overridden via environment variables
|
||||
MODEL=${MODEL:-Qwen/Qwen2.5-7B-Instruct}
|
||||
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
|
||||
PROXY_PORT=${PROXY_PORT:-8000}
|
||||
|
||||
PREFILL_GPUS=${PREFILL_GPUS:-0}
|
||||
DECODE_GPUS=${DECODE_GPUS:-1}
|
||||
PREFILL_PORTS=${PREFILL_PORTS:-8010}
|
||||
BOOTSTRAP_PORTS=${BOOTSTRAP_PORTS:-8998}
|
||||
DECODE_PORTS=${DECODE_PORTS:-8020}
|
||||
|
||||
echo "Warning: Mooncake Connector support for vLLM v1 is experimental and subject to change."
|
||||
echo ""
|
||||
echo "Architecture Configuration:"
|
||||
echo " Model: $MODEL"
|
||||
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS, Bootstrap Port:$BOOTSTRAP_PORTS"
|
||||
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
|
||||
echo " Proxy Port: $PROXY_PORT"
|
||||
echo " Timeout: ${TIMEOUT_SECONDS}s"
|
||||
echo ""
|
||||
|
||||
PIDS=()
|
||||
|
||||
# Switch to the directory of the current script
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
check_required_files() {
|
||||
local files=("mooncake_connector_proxy.py")
|
||||
for file in "${files[@]}"; do
|
||||
if [[ ! -f "$file" ]]; then
|
||||
echo "Required file $file not found in $(pwd)"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
|
||||
echo "Example: export HF_TOKEN=your_token_here"
|
||||
exit 1
|
||||
fi
|
||||
if [[ "$HF_TOKEN" != hf_* ]]; then
|
||||
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
echo "HF_TOKEN is set and valid."
|
||||
}
|
||||
|
||||
check_num_gpus() {
|
||||
# Check that at least 2 GPUs are available via nvidia-smi
|
||||
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
if [ "$num_gpus" -lt 2 ]; then
|
||||
echo "You need at least 2 GPUs to run disaggregated prefill."
|
||||
exit 1
|
||||
else
|
||||
echo "Found $num_gpus GPUs."
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_python_library_installed() {
|
||||
echo "Checking if $1 is installed..."
|
||||
if ! python3 -c "import $1" > /dev/null 2>&1; then
|
||||
echo "$1 is not installed. Please install it via pip install $1."
|
||||
exit 1
|
||||
else
|
||||
echo "$1 is installed."
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM # prevent re-entrancy
|
||||
pkill -9 -f "mooncake_connector_proxy.py"
|
||||
kill -- -$$ # negative PID == "this whole process-group"
|
||||
wait # reap children so we don't leave zombies
|
||||
exit 0
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
local timeout_seconds=$TIMEOUT_SECONDS
|
||||
local start_time=$(date +%s)
|
||||
|
||||
echo "Waiting for server on port $port..."
|
||||
|
||||
while true; do
|
||||
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
|
||||
echo "Server on port $port is ready."
|
||||
return 0
|
||||
fi
|
||||
|
||||
local now=$(date +%s)
|
||||
if (( now - start_time >= timeout_seconds )); then
|
||||
echo "Timeout waiting for server on port $port"
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
main() {
|
||||
check_required_files
|
||||
check_hf_token
|
||||
check_num_gpus
|
||||
ensure_python_library_installed vllm
|
||||
ensure_python_library_installed mooncake.engine
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
echo "Launching disaggregated serving components..."
|
||||
echo "Please check the log files for detailed output:"
|
||||
echo " - prefill*.log: Prefill server logs"
|
||||
echo " - decode*.log: Decode server logs"
|
||||
echo " - proxy.log: Proxy server log"
|
||||
|
||||
# Parse GPU and port arrays
|
||||
IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
|
||||
IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
|
||||
IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
|
||||
IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
|
||||
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
|
||||
|
||||
proxy_param=""
|
||||
|
||||
# =============================================================================
|
||||
# Launch Prefill Servers (X Producers)
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
|
||||
for i in "${!PREFILL_GPU_ARRAY[@]}"; do
|
||||
local gpu_id=${PREFILL_GPU_ARRAY[$i]}
|
||||
local port=${PREFILL_PORT_ARRAY[$i]}
|
||||
local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}
|
||||
|
||||
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--port $port \
|
||||
--kv-transfer-config \
|
||||
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
|
||||
PIDS+=($!)
|
||||
proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port"
|
||||
done
|
||||
|
||||
# =============================================================================
|
||||
# Launch Decode Servers (Y Decoders)
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
|
||||
for i in "${!DECODE_GPU_ARRAY[@]}"; do
|
||||
local gpu_id=${DECODE_GPU_ARRAY[$i]}
|
||||
local port=${DECODE_PORT_ARRAY[$i]}
|
||||
|
||||
echo " Decode server $((i+1)): GPU $gpu_id, Port $port"
|
||||
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
|
||||
--port $port \
|
||||
--kv-transfer-config \
|
||||
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
|
||||
PIDS+=($!)
|
||||
proxy_param="${proxy_param} --decode http://0.0.0.0:${port}"
|
||||
done
|
||||
|
||||
# =============================================================================
|
||||
# Launch Proxy Server
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Starting proxy server on port $PROXY_PORT..."
|
||||
python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 &
|
||||
PIDS+=($!)
|
||||
|
||||
# =============================================================================
|
||||
# Wait for All Servers to Start
|
||||
# =============================================================================
|
||||
echo ""
|
||||
echo "Waiting for all servers to start..."
|
||||
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
|
||||
if ! wait_for_server $port; then
|
||||
echo "Failed to start server on port $port"
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "All servers are up. Starting benchmark..."
|
||||
|
||||
# =============================================================================
|
||||
# Run Benchmark
|
||||
# =============================================================================
|
||||
vllm bench serve --port $PROXY_PORT --seed $(date +%s) \
|
||||
--backend vllm --model $MODEL \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
|
||||
|
||||
echo "Benchmarking done. Cleaning up..."
|
||||
|
||||
cleanup
|
||||
}
|
||||
|
||||
main
|
||||
151
examples/online_serving/openai_realtime_client.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script demonstrates how to use the vLLM Realtime WebSocket API to perform
|
||||
audio transcription by uploading an audio file.
|
||||
|
||||
Before running this script, you must start the vLLM server with a realtime-capable
|
||||
model, for example:
|
||||
|
||||
vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager
|
||||
|
||||
Requirements:
|
||||
- vllm with audio support
|
||||
- websockets
|
||||
- librosa
|
||||
- numpy
|
||||
|
||||
The script:
|
||||
1. Connects to the Realtime WebSocket endpoint
|
||||
2. Converts an audio file to PCM16 @ 16kHz
|
||||
3. Sends audio chunks to the server
|
||||
4. Receives and prints transcription as it streams
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import websockets
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
def audio_to_pcm16_base64(audio_path: str) -> str:
|
||||
"""
|
||||
Load an audio file and convert it to base64-encoded PCM16 @ 16kHz.
|
||||
"""
|
||||
# Load audio and resample to 16kHz mono
|
||||
audio, _ = librosa.load(audio_path, sr=16000, mono=True)
|
||||
# Convert to PCM16
|
||||
pcm16 = (audio * 32767).astype(np.int16)
|
||||
# Encode as base64
|
||||
return base64.b64encode(pcm16.tobytes()).decode("utf-8")
|
||||
|
||||
|
||||
async def realtime_transcribe(audio_path: str, host: str, port: int, model: str):
|
||||
"""
|
||||
Connect to the Realtime API and transcribe an audio file.
|
||||
"""
|
||||
uri = f"ws://{host}:{port}/v1/realtime"
|
||||
|
||||
async with websockets.connect(uri) as ws:
|
||||
# Wait for session.created
|
||||
response = json.loads(await ws.recv())
|
||||
if response["type"] == "session.created":
|
||||
print(f"Session created: {response['id']}")
|
||||
else:
|
||||
print(f"Unexpected response: {response}")
|
||||
return
|
||||
|
||||
# Validate model
|
||||
await ws.send(json.dumps({"type": "session.update", "model": model}))
|
||||
|
||||
# Signal ready to start
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
||||
|
||||
# Convert audio file to base64 PCM16
|
||||
print(f"Loading audio from: {audio_path}")
|
||||
audio_base64 = audio_to_pcm16_base64(audio_path)
|
||||
|
||||
# Send audio in chunks (4KB of raw audio ≈ 5.5KB base64)
|
||||
chunk_size = 4096
|
||||
audio_bytes = base64.b64decode(audio_base64)
|
||||
total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size
|
||||
|
||||
print(f"Sending {total_chunks} audio chunks...")
|
||||
for i in range(0, len(audio_bytes), chunk_size):
|
||||
chunk = audio_bytes[i : i + chunk_size]
|
||||
await ws.send(
|
||||
json.dumps(
|
||||
{
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": base64.b64encode(chunk).decode("utf-8"),
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
# Signal all audio is sent
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
|
||||
print("Audio sent. Waiting for transcription...\n")
|
||||
|
||||
# Receive transcription
|
||||
print("Transcription: ", end="", flush=True)
|
||||
while True:
|
||||
response = json.loads(await ws.recv())
|
||||
if response["type"] == "transcription.delta":
|
||||
print(response["delta"], end="", flush=True)
|
||||
elif response["type"] == "transcription.done":
|
||||
print(f"\n\nFinal transcription: {response['text']}")
|
||||
if response.get("usage"):
|
||||
print(f"Usage: {response['usage']}")
|
||||
break
|
||||
elif response["type"] == "error":
|
||||
print(f"\nError: {response['error']}")
|
||||
break
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.audio_path:
|
||||
audio_path = args.audio_path
|
||||
else:
|
||||
# Use default audio asset
|
||||
audio_path = str(AudioAsset("mary_had_lamb").get_local_path())
|
||||
print(f"No audio path provided, using default: {audio_path}")
|
||||
|
||||
asyncio.run(realtime_transcribe(audio_path, args.host, args.port, args.model))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Realtime WebSocket Transcription Client"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="mistralai/Voxtral-Mini-4B-Realtime-2602",
|
||||
help="Model that is served and should be pinged.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--audio_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the audio file to transcribe.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host",
|
||||
type=str,
|
||||
default="localhost",
|
||||
help="vLLM server host (default: localhost)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=8000,
|
||||
help="vLLM server port (default: 8000)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
183
examples/online_serving/openai_realtime_microphone_client.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Minimal Gradio demo for real-time speech transcription using the vLLM Realtime API.
|
||||
|
||||
Start the vLLM server first:
|
||||
|
||||
vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager
|
||||
|
||||
Then run this script:
|
||||
|
||||
python openai_realtime_microphone_client.py --host localhost --port 8000
|
||||
|
||||
Use --share to create a public Gradio link.
|
||||
|
||||
Requirements: websockets, numpy, gradio
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
|
||||
import gradio as gr
|
||||
import numpy as np
|
||||
import websockets
|
||||
|
||||
SAMPLE_RATE = 16_000
|
||||
|
||||
# Global state
|
||||
audio_queue: queue.Queue = queue.Queue()
|
||||
transcription_text = ""
|
||||
is_running = False
|
||||
ws_url = ""
|
||||
model = ""
|
||||
|
||||
|
||||
async def websocket_handler():
|
||||
"""Connect to WebSocket and handle audio streaming + transcription."""
|
||||
global transcription_text, is_running
|
||||
|
||||
async with websockets.connect(ws_url) as ws:
|
||||
# Wait for session.created
|
||||
await ws.recv()
|
||||
|
||||
# Validate model
|
||||
await ws.send(json.dumps({"type": "session.update", "model": model}))
|
||||
|
||||
# Signal ready
|
||||
await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
||||
|
||||
async def send_audio():
|
||||
while is_running:
|
||||
try:
|
||||
chunk = await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: audio_queue.get(timeout=0.1)
|
||||
)
|
||||
await ws.send(
|
||||
json.dumps(
|
||||
{"type": "input_audio_buffer.append", "audio": chunk}
|
||||
)
|
||||
)
|
||||
except queue.Empty:
|
||||
continue
|
||||
|
||||
async def receive_transcription():
|
||||
global transcription_text
|
||||
async for message in ws:
|
||||
data = json.loads(message)
|
||||
if data.get("type") == "transcription.delta":
|
||||
transcription_text += data["delta"]
|
||||
|
||||
await asyncio.gather(send_audio(), receive_transcription())
|
||||
|
||||
|
||||
def start_websocket():
|
||||
"""Start WebSocket connection in background thread."""
|
||||
global is_running
|
||||
is_running = True
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
try:
|
||||
loop.run_until_complete(websocket_handler())
|
||||
except Exception as e:
|
||||
print(f"WebSocket error: {e}")
|
||||
|
||||
|
||||
def start_recording():
|
||||
"""Start the transcription service."""
|
||||
global transcription_text
|
||||
transcription_text = ""
|
||||
thread = threading.Thread(target=start_websocket, daemon=True)
|
||||
thread.start()
|
||||
return gr.update(interactive=False), gr.update(interactive=True), ""
|
||||
|
||||
|
||||
def stop_recording():
|
||||
"""Stop the transcription service."""
|
||||
global is_running
|
||||
is_running = False
|
||||
return gr.update(interactive=True), gr.update(interactive=False), transcription_text
|
||||
|
||||
|
||||
def process_audio(audio):
|
||||
"""Process incoming audio and queue for streaming."""
|
||||
global transcription_text
|
||||
|
||||
if audio is None or not is_running:
|
||||
return transcription_text
|
||||
|
||||
sample_rate, audio_data = audio
|
||||
|
||||
# Convert to mono if stereo
|
||||
if len(audio_data.shape) > 1:
|
||||
audio_data = audio_data.mean(axis=1)
|
||||
|
||||
# Normalize to float
|
||||
if audio_data.dtype == np.int16:
|
||||
audio_float = audio_data.astype(np.float32) / 32767.0
|
||||
else:
|
||||
audio_float = audio_data.astype(np.float32)
|
||||
|
||||
# Resample to 16kHz if needed
|
||||
if sample_rate != SAMPLE_RATE:
|
||||
num_samples = int(len(audio_float) * SAMPLE_RATE / sample_rate)
|
||||
audio_float = np.interp(
|
||||
np.linspace(0, len(audio_float) - 1, num_samples),
|
||||
np.arange(len(audio_float)),
|
||||
audio_float,
|
||||
)
|
||||
|
||||
# Convert to PCM16 and base64 encode
|
||||
pcm16 = (audio_float * 32767).astype(np.int16)
|
||||
b64_chunk = base64.b64encode(pcm16.tobytes()).decode("utf-8")
|
||||
audio_queue.put(b64_chunk)
|
||||
|
||||
return transcription_text
|
||||
|
||||
|
||||
# Gradio interface
|
||||
with gr.Blocks(title="Real-time Speech Transcription") as demo:
|
||||
gr.Markdown("# Real-time Speech Transcription")
|
||||
gr.Markdown("Click **Start** and speak into your microphone.")
|
||||
|
||||
with gr.Row():
|
||||
start_btn = gr.Button("Start", variant="primary")
|
||||
stop_btn = gr.Button("Stop", variant="stop", interactive=False)
|
||||
|
||||
audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
|
||||
transcription_output = gr.Textbox(label="Transcription", lines=5)
|
||||
|
||||
start_btn.click(
|
||||
start_recording, outputs=[start_btn, stop_btn, transcription_output]
|
||||
)
|
||||
stop_btn.click(stop_recording, outputs=[start_btn, stop_btn, transcription_output])
|
||||
audio_input.stream(
|
||||
process_audio, inputs=[audio_input], outputs=[transcription_output]
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Realtime WebSocket Transcription with Gradio"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="mistralai/Voxtral-Mini-4B-Realtime-2602",
|
||||
help="Model that is served and should be pinged.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host", type=str, default="localhost", help="vLLM server host"
|
||||
)
|
||||
parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
|
||||
parser.add_argument(
|
||||
"--share", action="store_true", help="Create public Gradio link"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
ws_url = f"ws://{args.host}:{args.port}/v1/realtime"
|
||||
model = args.model
|
||||
demo.launch(share=args.share)
|
||||
@@ -49,8 +49,8 @@ def main():
|
||||
# Refer to the HuggingFace repo for the correct format to use
|
||||
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
|
||||
token_ids = tokenizer.apply_chat_template(
|
||||
chat, add_generation_prompt=True, return_tensors="pt"
|
||||
)
|
||||
chat, add_generation_prompt=True, return_tensors="pt", return_dict=True
|
||||
).input_ids
|
||||
|
||||
embedding_layer = transformers_model.get_input_embeddings()
|
||||
prompt_embeds = embedding_layer(token_ids).squeeze(0)
|
||||
@@ -60,9 +60,7 @@ def main():
|
||||
|
||||
completion = client.completions.create(
|
||||
model=model_name,
|
||||
# NOTE: The OpenAI client does not allow `None` as an input to
|
||||
# `prompt`. Use an empty string if you have no text prompts.
|
||||
prompt="",
|
||||
prompt=None,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
# NOTE: The OpenAI client allows passing in extra JSON body via the
|
||||
|
||||
@@ -27,7 +27,8 @@ def main(client):
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
enable_thinking=False,
|
||||
)
|
||||
return_dict=True,
|
||||
).input_ids
|
||||
payload = {
|
||||
"model": MODEL_NAME,
|
||||
"token_ids": token_ids,
|
||||
|
||||
@@ -12,11 +12,7 @@ import base64
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
binary2tensor,
|
||||
)
|
||||
from vllm.utils.serial_utils import EMBED_DTYPES, ENDIANNESS, binary2tensor
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
@@ -45,7 +41,7 @@ def main(args):
|
||||
] * 2
|
||||
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -12,13 +12,12 @@ import json
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
ENDIANNESS,
|
||||
from vllm.entrypoints.pooling.utils import (
|
||||
MetadataItem,
|
||||
build_metadata_items,
|
||||
decode_pooling_output,
|
||||
)
|
||||
from vllm.utils.serial_utils import EMBED_DTYPES, ENDIANNESS
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
@@ -51,7 +50,7 @@ def main(args):
|
||||
|
||||
# The OpenAI client does not support the bytes encoding_format.
|
||||
# The OpenAI client does not support the embed_dtype and endianness parameters.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
@@ -74,7 +73,7 @@ def main(args):
|
||||
# The vllm server always sorts the returned embeddings in the order of input. So
|
||||
# returning metadata is not necessary. You can set encoding_format to bytes_only
|
||||
# to let the server not return metadata.
|
||||
for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
|
||||
for embed_dtype in EMBED_DTYPES:
|
||||
for endianness in ENDIANNESS:
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -12,6 +12,8 @@ on HuggingFace model repository.
|
||||
import argparse
|
||||
from dataclasses import asdict
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
@@ -20,17 +22,42 @@ text = "A cat standing in the snow."
|
||||
multi_modal_data = {"image": fetch_image(image_url)}
|
||||
|
||||
|
||||
def print_embeddings(embeds):
|
||||
def print_embeddings(embeds: list[float]):
|
||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
|
||||
|
||||
def run_qwen3_vl():
|
||||
try:
|
||||
from qwen_vl_utils import smart_resize
|
||||
except ModuleNotFoundError:
|
||||
print(
|
||||
"WARNING: `qwen-vl-utils` not installed, input images will not "
|
||||
"be automatically resized. This can cause different results "
|
||||
"comparing with HF repo's example. "
|
||||
"You can enable this functionality by `pip install qwen-vl-utils`."
|
||||
)
|
||||
smart_resize = None
|
||||
|
||||
if smart_resize is not None:
|
||||
|
||||
def post_process_image(image: Image) -> Image:
|
||||
width, height = image.size
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=32,
|
||||
)
|
||||
return image.resize((resized_width, resized_height))
|
||||
|
||||
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||
runner="pooling",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
|
||||
)
|
||||
default_instruction = "Represent the user's input."
|
||||
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
|
||||
57
examples/pooling/score/colbert_rerank_online.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Example of using ColBERT late interaction model for reranking.
|
||||
|
||||
ColBERT (Contextualized Late Interaction over BERT) uses per-token embeddings
|
||||
and MaxSim scoring for document reranking, providing better accuracy than
|
||||
single-vector models while being more efficient than cross-encoders.
|
||||
|
||||
Start the server with:
|
||||
vllm serve answerdotai/answerai-colbert-small-v1
|
||||
|
||||
Then run this script:
|
||||
python colbert_rerank_online.py
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
url = "http://127.0.0.1:8000/rerank"
|
||||
|
||||
headers = {"accept": "application/json", "Content-Type": "application/json"}
|
||||
|
||||
data = {
|
||||
"model": "answerdotai/answerai-colbert-small-v1",
|
||||
"query": "What is machine learning?",
|
||||
"documents": [
|
||||
"Machine learning is a subset of artificial intelligence.",
|
||||
"Python is a programming language.",
|
||||
"Deep learning uses neural networks for complex tasks.",
|
||||
"The weather today is sunny.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
|
||||
if response.status_code == 200:
|
||||
print("ColBERT Rerank Request successful!")
|
||||
result = response.json()
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
# Show ranked results
|
||||
print("\nRanked documents (most relevant first):")
|
||||
for item in result["results"]:
|
||||
doc_idx = item["index"]
|
||||
score = item["relevance_score"]
|
||||
print(f" Score {score:.4f}: {data['documents'][doc_idx]}")
|
||||
else:
|
||||
print(f"Request failed with status code: {response.status_code}")
|
||||
print(response.text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -89,6 +89,29 @@ def main(args):
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -92,6 +92,44 @@ def main(args):
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: list & Document: list")
|
||||
data = [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
]
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": data,
|
||||
"documents": data,
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -11,7 +11,7 @@ transformers >= 4.56.0, < 5
|
||||
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
|
||||
protobuf # Required by LlamaTokenizer, gRPC.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
aiohttp
|
||||
aiohttp >= 3.13.3
|
||||
openai >= 1.99.1 # For Responses API with reasoning content
|
||||
pydantic >= 2.12.0
|
||||
prometheus_client >= 0.18.0
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
ray[cgraph]>=2.48.0
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.6.1
|
||||
flashinfer-python==0.6.2
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
lmcache
|
||||
lmcache >= 0.3.9
|
||||
nixl >= 0.7.1 # Required for disaggregated prefill
|
||||
|
||||
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
|
||||
# Async/HTTP dependencies
|
||||
anyio==4.6.2.post1
|
||||
# via httpx, starlette
|
||||
aiohttp==3.13.0
|
||||
aiohttp==3.13.3
|
||||
# via gpt-oss
|
||||
httpx==0.27.2
|
||||
# HTTP testing
|
||||
@@ -94,3 +94,5 @@ timm==1.0.17
|
||||
albumentations==1.4.6
|
||||
# Pin transformers version
|
||||
transformers==4.57.3
|
||||
# Pin HF Hub version
|
||||
huggingface-hub==0.36.1
|
||||
|
||||
@@ -5,7 +5,7 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for AMD GPUs
|
||||
datasets
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
ray[cgraph]>=2.48.0
|
||||
peft
|
||||
pytest-asyncio
|
||||
tensorizer==2.10.1
|
||||
|
||||
@@ -12,7 +12,7 @@ affine==2.4.0
|
||||
# via rasterio
|
||||
aiohappyeyeballs==2.6.1
|
||||
# via aiohttp
|
||||
aiohttp==3.13.0
|
||||
aiohttp==3.13.3
|
||||
# via
|
||||
# aiohttp-cors
|
||||
# datasets
|
||||
@@ -332,7 +332,7 @@ httpx==0.27.2
|
||||
# -r requirements/test.in
|
||||
# perceptron
|
||||
# schemathesis
|
||||
huggingface-hub==0.34.3
|
||||
huggingface-hub==0.36.1
|
||||
# via
|
||||
# accelerate
|
||||
# datasets
|
||||
|
||||
@@ -11,8 +11,8 @@ jinja2>=3.1.6
|
||||
datasets # for benchmark scripts
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
torch==2.9.0+xpu
|
||||
torch==2.10.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl
|
||||
@@ -1,321 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from tests.compile.fusion_test_utils import (
|
||||
CUSTOM_OPS_FP8,
|
||||
CUSTOM_OPS_QUANT_RMS_NORM,
|
||||
CUSTOM_OPS_RMS_NORM,
|
||||
MODELS,
|
||||
MODELS_FP4,
|
||||
MODELS_FP8,
|
||||
MODELS_GROUP_FP8,
|
||||
Matches,
|
||||
custom_ops_product,
|
||||
is_blackwell,
|
||||
run_model,
|
||||
)
|
||||
from tests.v1.attention.utils import AttentionBackendEnum
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import flat_product, multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
|
||||
)
|
||||
)
|
||||
# Toggle RMSNorm for FP4 models and unquant models
|
||||
+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
|
||||
)
|
||||
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda()
|
||||
or not has_flashinfer()
|
||||
or not current_platform.has_device_capability(90),
|
||||
reason="allreduce+rmsnorm fusion requires flashinfer",
|
||||
)
|
||||
def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: AttentionBackendEnum,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
if "fp4" in model_name.lower() and not is_blackwell():
|
||||
pytest.skip("NVFP4 quant requires Blackwell")
|
||||
|
||||
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
|
||||
# FlashInfer attn fusion requires Blackwell
|
||||
matches = matches._replace(attention_fusion=0)
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
mode = CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
splitting_ops: list[str] | None = None
|
||||
else:
|
||||
mode = CUDAGraphMode.FULL_DECODE_ONLY
|
||||
splitting_ops = []
|
||||
|
||||
# Disable, compile cache to make sure custom passes run.
|
||||
# Otherwise, we can't verify fusion happened through the logs.
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
|
||||
|
||||
# To capture subprocess logs, we need to know whether spawn or fork is used.
|
||||
# Force spawn as it is more general.
|
||||
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
|
||||
|
||||
model_kwargs["attention_config"] = {"backend": backend.name}
|
||||
|
||||
compilation_config = CompilationConfig(
|
||||
# Testing properties
|
||||
use_inductor_graph_partition=inductor_graph_partition,
|
||||
cudagraph_mode=mode,
|
||||
custom_ops=custom_ops_list,
|
||||
splitting_ops=splitting_ops,
|
||||
# Common
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
pass_config=PassConfig(
|
||||
fuse_attn_quant=True,
|
||||
eliminate_noops=True,
|
||||
fuse_allreduce_rms=True,
|
||||
),
|
||||
# Inductor caches custom passes by default as well via uuid
|
||||
inductor_compile_config={"force_disable_caches": True},
|
||||
)
|
||||
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
# 2 for each compile range
|
||||
# (global compile range can be split due to fuse_allreduce_rmsnorm)
|
||||
num_compile_ranges = len(compilation_config.get_compile_ranges())
|
||||
assert num_compile_ranges in [1, 2]
|
||||
|
||||
assert len(log_matches) == 2 * num_compile_ranges, log_holder.text
|
||||
|
||||
assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)
|
||||
|
||||
log_matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(log_matches[0]) == matches.allreduce_fusion
|
||||
assert int(log_matches[1]) == matches.allreduce_fusion
|
||||
|
||||
log_matches = re.findall(
|
||||
r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    if is_blackwell():
        # TODO: https://github.com/vllm-project/vllm/issues/27893
        pytest.skip("Blackwell is not supported for AsyncTP pass")

    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    if backend == AttentionBackendEnum.FLASHINFER:
        if not has_flashinfer():
            pytest.skip("FlashInfer backend requires flashinfer installed")
        if not is_blackwell():
            # FlashInfer attn fusion requires Blackwell
            matches = matches._replace(attention_fusion=0)

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    model_kwargs["attention_config"] = {"backend": backend.name}

    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_attn_quant=True,
            eliminate_noops=True,
            enable_sp=True,
            fuse_gemm_comms=True,
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )
    log_matches = re.findall(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.attention_fusion
    assert int(log_matches[1]) == matches.attention_fusion

    log_matches = re.findall(
        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.sequence_parallel
    assert int(log_matches[1]) == matches.sequence_parallel

    log_matches = re.findall(
        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 2, log_holder.text

    assert int(log_matches[0]) == matches.async_tp
    assert int(log_matches[1]) == matches.async_tp


@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test rms norm+group quant_fp8 fusion
    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
# TODO: remove skip after we fix the fusion thoroughly
@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
def test_rms_group_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    custom_ops_list = custom_ops.split(",") if custom_ops else []

    if inductor_graph_partition:
        mode = CUDAGraphMode.FULL_AND_PIECEWISE
        splitting_ops: list[str] | None = None
    else:
        mode = CUDAGraphMode.FULL_DECODE_ONLY
        splitting_ops = []

    # Disable compile cache to make sure custom passes run.
    # Otherwise, we can't verify fusion happened through the logs.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    # TODO: remove this after fusion is fixed
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "0")

    model_kwargs["attention_config"] = {"backend": backend.name}

    compilation_config = CompilationConfig(
        # Testing properties
        custom_ops=custom_ops_list,
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=PassConfig(
            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(compilation_config, model_name, **model_kwargs)

    log_matches = re.findall(
        r"\[fusion.py:\d+] Replaced (\d+) patterns",
        log_holder.text,
    )
    assert len(log_matches) == 1, log_holder.text
    assert int(log_matches[0]) == matches.rms_quant_norm_fusion

@@ -1,208 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Shared utilities for fusion tests (e.g. test_fusion_attn.py)."""

from __future__ import annotations

import itertools
from collections.abc import Iterable
from typing import Any, NamedTuple

from tests.v1.attention.utils import AttentionBackendEnum
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CUDAGraphMode
from vllm.platforms import current_platform

is_blackwell = lambda: current_platform.is_device_capability_family(100)
"""Whether we are running on Blackwell; a lot of tests depend on it."""


def has_cuda_graph_wrapper_metadata() -> bool:
    """Return True if torch._inductor.utils exposes CUDAGraphWrapperMetadata."""
    from importlib import import_module

    try:
        module = import_module("torch._inductor.utils")
        module.CUDAGraphWrapperMetadata  # noqa: B018
    except AttributeError:
        return False
    return True


class Matches(NamedTuple):
    """Expected per-pass match counts asserted against the fusion logs."""

    attention_fusion: int = 0
    allreduce_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
    rms_quant_norm_fusion: int = 0
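
# Illustrative example (not part of the original module): tests start from a
# per-model baseline and adjust it when a fusion is unavailable, e.g.
#   matches = Matches(attention_fusion=32, allreduce_fusion=65)
#   matches = matches._replace(attention_fusion=0)  # fusion unsupported on this GPU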


class ModelBackendTestCase(NamedTuple):
    model_name: str
    model_kwargs: dict[str, Any]
    backend: AttentionBackendEnum
    matches: Matches


# E2E model test cases
MODELS_FP8: list[ModelBackendTestCase] = []
MODELS_FP4: list[ModelBackendTestCase] = []
MODELS: list[ModelBackendTestCase] = []  # tp-only (unquantized)
MODELS_GROUP_FP8: list[ModelBackendTestCase] = []

if current_platform.is_cuda():
    MODELS_FP8 = [
        ModelBackendTestCase(
            # Use smaller model for L40s in CI
            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
            # https://github.com/vllm-project/vllm/issues/28568
            backend=AttentionBackendEnum.FLASHINFER
            if is_blackwell()
            else AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=48,
                allreduce_fusion=96,
                sequence_parallel=96,
                async_tp=95,  # mlp is moe, no fusion there
            ),
        ),
    ]

    MODELS_FP4 = [
        ModelBackendTestCase(
            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.FLASHINFER,
            matches=Matches(
                attention_fusion=32,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
    ]

    # TP only (unquantized models)
    MODELS = [
        ModelBackendTestCase(
            model_name="meta-llama/Llama-3.1-8B-Instruct",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=65,
                sequence_parallel=65,
                async_tp=128,
            ),
        ),
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                attention_fusion=0,
                allreduce_fusion=97,
                sequence_parallel=97,
                async_tp=96,  # MLP is MoE, half the fusions of dense
            ),
        ),
    ]

    MODELS_GROUP_FP8 = [
        ModelBackendTestCase(
            model_name="Qwen/Qwen3-30B-A3B-FP8",
            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(
                rms_quant_norm_fusion=48,
            ),
        ),
    ]

elif current_platform.is_rocm():
    MODELS_FP8 = [
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.TRITON_ATTN,
            matches=Matches(attention_fusion=32),
        ),
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.ROCM_ATTN,
            matches=Matches(attention_fusion=32),
        ),
        ModelBackendTestCase(
            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
            model_kwargs=dict(max_model_len=1024),
            backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
            matches=Matches(attention_fusion=32),
        ),
    ]


# Custom ops toggle lists for parametrization
CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]


def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
    """Generate all combinations of custom ops for parametrization."""
    for op_list in itertools.product(*custom_ops_lists):
        yield ",".join(op_list)


def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
    """Run a model with the given compilation config for E2E fusion tests."""
    compilation_config = (
        compile_config
        if isinstance(compile_config, CompilationConfig)
        else CompilationConfig(mode=compile_config)
    )

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    # Allow override from model_kwargs
    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}

    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
    llm = LLM(
        model=model,
        compilation_config=compilation_config,
        **model_kwargs,
    )
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Get the compile ranges split points after vllm config post init
    # in order to compute compile ranges correctly
    compilation_config.compile_ranges_split_points = (
        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
    )
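
# Illustrative usage (mirrors the calls in the tests above):
#   run_model(compilation_config, model_name, tensor_parallel_size=2, **model_kwargs)
# An int may also be passed instead of a CompilationConfig, in which case it is
# treated as the compilation mode.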