diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt
new file mode 100644
index 000000000..5552391d9
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-rocm-fp8.txt
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
diff --git a/.buildkite/scripts/check-ray-compatibility.sh b/.buildkite/scripts/check-ray-compatibility.sh
index d44d074c2..1572fe941 100644
--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
@@ -16,6 +16,25 @@ RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
 WORK_DIR=$(mktemp -d)
 trap 'rm -rf "$WORK_DIR"' EXIT
 
+# ── Detect PyTorch index URL ─────────────────────────────────────────────
+
+if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
+    # torch.version.hip is e.g. "6.2.41133-..."; rsplit keeps only the
+    # "major.minor" prefix that names the rocm<X.Y> wheel index.
+    ROCM_VER=$(python3 -c "import torch; print(torch.version.hip.rsplit('.', 1)[0])")
+    CANDIDATE_URL="https://download.pytorch.org/whl/rocm${ROCM_VER}"
+    if curl -fsSL --head "${CANDIDATE_URL}/" >/dev/null 2>&1; then
+        TORCH_INDEX_URL="${CANDIDATE_URL}"
+    else
+        echo ">>> WARNING: ROCm ${ROCM_VER} wheel index not found at ${CANDIDATE_URL}"
+        echo ">>> Falling back to default PyPI (resolution may be incomplete)"
+        TORCH_INDEX_URL=""
+    fi
+else
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
+fi
+echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
+
 # Fetch all Ray requirement files used in the LLM depset pipeline
 echo ">>> Fetching Ray requirement files"
 RAY_FILES=(
@@ -116,6 +133,11 @@ echo "============================================================"
 echo ">>> Resolving: Can Ray generate compatible lock files?"
 echo "============================================================"
 
+EXTRA_INDEX_ARGS=()
+if [[ -n "${TORCH_INDEX_URL}" ]]; then
+    EXTRA_INDEX_ARGS+=(--extra-index-url "${TORCH_INDEX_URL}")
+fi
+
 set +e
 uv pip compile \
     "${WORK_DIR}/requirements.txt" \
@@ -126,7 +148,7 @@ uv pip compile \
     -c "${WORK_DIR}/vllm-constraints.txt" \
     --python-version 3.12 \
     --python-platform x86_64-manylinux_2_31 \
-    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    "${EXTRA_INDEX_ARGS[@]}" \
     --index-strategy unsafe-best-match \
     --unsafe-package setuptools \
     --unsafe-package ray \
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
index dddf23f1f..de48eb282 100755
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh
@@ -1,11 +1,18 @@
 #!/usr/bin/env bash
 set -euxo pipefail
-
 # Nightly e2e test for prefetch offloading with a MoE model.
 # Runs DeepSeek-V2-Lite with prefetch offloading of MoE expert weights
 # and validates GSM8K accuracy matches baseline (no offloading).
 #
 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+#
+# Environment variables:
+#   ATTENTION_BACKEND - attention backend to use (e.g., FLASH_ATTN,
+#                       ROCM_ATTN, FLASHINFER). If unset, uses the vLLM default.
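+#
+# Example invocation (illustrative; the positional values shown are this
+# script's own defaults, and ROCM_ATTN is one of the backends listed above):
+#   ATTENTION_BACKEND=ROCM_ATTN ./deepseek_v2_lite_prefetch_offload.sh 0.25 1319 8030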
THRESHOLD=${1:-0.25} NUM_Q=${2:-1319} PORT=${3:-8030} @@ -22,6 +25,14 @@ wait_for_server() { MODEL="deepseek-ai/DeepSeek-V2-Lite" +# ── Build optional vllm serve flags ───────────────────────────────────── + +EXTRA_ARGS=() +if [[ -n "${ATTENTION_BACKEND:-}" ]]; then + echo "Using attention backend: ${ATTENTION_BACKEND}" + EXTRA_ARGS+=(--attention-backend "${ATTENTION_BACKEND}") +fi + cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then kill "${SERVER_PID}" 2>/dev/null || true @@ -40,7 +51,8 @@ vllm serve "$MODEL" \ --offload-num-in-group 2 \ --offload-prefetch-step 1 \ --offload-params w13_weight w2_weight \ - --port "$PORT" & + --port "$PORT" \ + ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} & SERVER_PID=$! wait_for_server "$PORT" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5e2c25936..82e97bfbb 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -15,7 +15,6 @@ # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for the test. incompatible with command. # mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] -# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 # num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. # num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, # in this case, commands must be specified. the first command runs on the first host, the second @@ -32,6 +31,80 @@ # - If the test takes more than 10min, then it is okay to create a new step. # Note that all steps execute in parallel. + +##################################################################################################################################### +# # +# README # +# # +##################################################################################################################################### +# # +# IMPORTANT: # +# * Currently AMD CI has MI300 agents, MI325 agents, and MI355 agents. Of those, AMD is using mostly MI325 and MI355. AMD team # +# is actively working on enabling more MI300 machines. All upcoming feature improvements are tracked in: # +# https://github.com/vllm-project/vllm/issues/34994 # +# # +#-----------------------------------------------------------------------------------------------------------------------------------# +# # +# NOTES: # +# * [Pytorch Nightly Dependency Override Check]: if this test fails, it means the nightly torch version is not compatible with # +# some of the dependencies. Please check the error message and add the package to # +# whitelist in `/vllm/tools/pre_commit/generate_nightly_torch_test.py`. # +# * [Entrypoints Integration Test (LLM)]: # +# - {`pytest -v -s entrypoints/llm/test_generate.py`}: It needs a clean process # +# - {`pytest -v -s entrypoints/offline_mode`}: Needs to avoid interference with other tests # +# * [V1 Test e2e + engine]: The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. See discussion here: # +# https://github.com/vllm-project/vllm/pull/31040 # +# * [V1 others]: # +# - Split the tests to avoid interference # +# - Integration test for streaming correctness (requires special branch for __harness__ lib). 
# +# * [V1 others (CPU)]: Split the tests to avoid interference # +# * [PyTorch Compilation Unit Tests]: Run unit tests defined directly under `compile/`, not including subdirectories, which # +# are usually heavier tests covered elsewhere. Use `find` to launch multiple instances # +# of pytest so that they do not suffer from: # +# https://github.com/vllm-project/vllm/issues/28965 # +# * [PyTorch Fullgraph Smoke Test]: Run smoke tests under fullgraph directory, except `test_full_graph.py` as it is a heavy # +# test that is covered in other steps. Use `find` to launch multiple instances of pytest # +# so that they do not suffer from: https://github.com/vllm-project/vllm/issues/28965 # +# * [PyTorch Fullgraph]: # +# - Limit to no custom ops to reduce running time. Wrap with quotes to escape yaml and avoid starting `-k` string # +# with a `-` # +# - Old E2E tests such as: # +# ```bash # +# pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4' # +# ``` # +# were removed in https://github.com/vllm-project/vllm/pull/33293 in favor of new tests in `fusions_e2e`. We # +# avoid replicating the new jobs in this file as it's deprecated. # +# * [Basic Models Tests (Extra Initialization) %N]: Only when vLLM model source is modified - test initialization of a # +# large subset of supported models (the complement of the small subset in # +# the above test.) Also run if model initialization test file is modified. # +# * [Language Models Tests (Extra Standard) %N]: Shard slow subset of standard language models tests. Only run when model # +# source is modified, or when specified test files are modified. # +# * [Language Models Tests (Hybrid) %N]: Install fast path packages for testing against transformers (mamba, conv1d) and to # +# run plamo2 model in vLLM. # +# * [Language Models Test (Extended Generation)]: Install fast path packages for testing against transformers (mamba, conv1d) # +# and to run plamo2 model in vLLM. # +# * [Multi-Modal Models (Standard)]: # +# - Do NOT remove `VLLM_WORKER_MULTIPROC_METHOD=spawn` setting as ROCm requires this for certain models to function. # +# * [Transformers Nightly Models Test]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock. # +# * [Plugin Tests (2 GPUs)]: # +# - {`pytest -v -s entrypoints/openai/test_oot_registration.py`}: It needs a clean process # +# - {`pytest -v -s models/test_oot_registration.py`}: It needs a clean process # +# - {`pytest -v -s plugins/lora_resolvers`}: Unit tests for in-tree lora resolver plugins # +# * [LoRA TP (Distributed)]: # +# - There is some Tensor Parallelism related processing logic in LoRA that requires multi-GPU testing for validation. # +# - {`pytest -v -s -x lora/test_gptoss_tp.py`}: Disabled for now because MXFP4 backend on non-cuda platform doesn't support # +# LoRA yet. # +# * [Distributed Tests (GPU_TAG)]: Don't test llama model here, it seems hf implementation is buggy. See: # +# https://github.com/vllm-project/vllm/pull/5689 # +# * [Distributed Tests (GPU_TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 in # +# favor of new tests in fusions_e2e. We avoid replicating the new jobs in # +# this file as it's deprecated. 
# +# # +##################################################################################################################################### + + + + steps: @@ -41,18 +114,25 @@ steps: # # ##################################################################################################################################### -- label: Pytorch Nightly Dependency Override Check # 2min - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Pytorch Nightly Dependency Override Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + soft_fail: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - requirements/nightly_torch_test.txt + - vllm/platforms/rocm.py commands: - bash standalone_tests/pytorch_nightly_dependency.sh -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Async Engine, Inputs, Utils, Worker # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/detokenizer @@ -63,15 +143,20 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + no_gpu: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/test_pooling_params.py + - tests/test_ray_env.py - tests/multimodal - tests/renderers - tests/standalone_tests/lazy_imports.py @@ -79,12 +164,12 @@ steps: - tests/tool_parsers - tests/transformers_utils - tests/config - no_gpu: true commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s test_pooling_params.py + - pytest -v -s test_ray_env.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ @@ -92,22 +177,28 @@ steps: - pytest -v -s transformers_utils - pytest -v -s config -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Python-only Installation # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - tests/standalone_tests/python_only_compile.sh - setup.py + - vllm/platforms/rocm.py commands: - bash standalone_tests/python_only_compile.sh -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -119,22 +210,25 @@ steps: - pytest -v -s 
basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py -- label: Entrypoints Unit Tests # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Entrypoints Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" fast_check: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/entrypoints - tests/entrypoints/ + - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Entrypoints Integration (LLM) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 fast_check: true torch_nightly: true @@ -149,30 +243,14 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py - pytest -v -s entrypoints/offline_mode -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -184,29 +262,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - fast_check: true - torch_nightly: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test 
(Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints Integration (Responses API) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/openai/responses @@ -214,103 +277,59 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/responses -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/offline_inference/new_weight_syncing/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - - pushd ../examples/offline_inference/new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - 
vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative, amdgfx90a] +- label: EPLB Algorithm # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb - tests/distributed/test_eplb_algo.py + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: EPLB Execution # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_4 num_gpus: 4 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb - tests/distributed/test_eplb_execute.py + - tests/distributed/test_eplb_spec_decode.py + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_eplb_execute.py - pytest -v -s distributed/test_eplb_spec_decode.py -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 + +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s distributed/test_elastic_ep.py + + +- label: Metrics, Tracing (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 num_gpus: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/v1/tracing @@ -322,9 +341,10 @@ steps: 'opentelemetry-semantic-conventions-ai>=0.4.1'" - pytest -v -s v1/tracing -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -334,10 +354,13 @@ steps: - pip install modelscope - pytest -v -s test_regression.py -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Engine # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/engine @@ -348,730 +371,824 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] 
agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/ + - tests/v1/engine/ + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/e2e - - pytest -v -s v1/engine + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + + +- label: Spec Decode Ngram + Suffix # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + + +- label: V1 e2e (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/e2e commands: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - 
agent_pool: mi250_4 - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Entrypoints V1 # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1 commands: - - pytest -v -s v1/entrypoints + - pytest -v -s v1/entrypoints -- label: V1 Test others # 42min + +- label: V1 Sample + Logits # TBD timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py -- label: V1 Test attention (H100) # 10min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py commands: - - pytest -v -s v1/attention + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Batch Invariance Tests (H100) # 10min - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py -- label: V1 Test others (CPU) # 5 mins - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: V1 attention (H100-MI250) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + + +- label: V1 others (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1 commands: - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - pip install tensorizer - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py 
+ - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Platform Tests (CUDA) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/cuda commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Samplers Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py + - vllm/v1/sample/ + - vllm/beam_search.py - tests/samplers - tests/conftest.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s samplers + - pytest -v -s samplers -- label: LoRA Test %N # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: LoRA %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true parallelism: 4 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/lora - tests/lora + - vllm/platforms/rocm.py commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py 
--ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: PyTorch Compilation Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/compile - commands: - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Compilation Passes Unit Tests - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - vllm/ - - tests/compile/passes - commands: - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ + - vllm/compilation/ + - vllm/model_executor/layers/ + - vllm/v1/worker/ + - vllm/v1/attention/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - csrc/ - tests/compile + - vllm/platforms/rocm.py + commands: + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" + + +- label: PyTorch Fullgraph Smoke Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ + - tests/compile + - vllm/platforms/rocm.py commands: - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: PyTorch Fullgraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ + - vllm/compilation/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/config/compilation.py + - csrc/ - tests/compile + - vllm/platforms/rocm.py commands: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' -- label: Cudagraph test # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Cudagraph # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - tests/v1/cudagraph - vllm/v1/cudagraph_dispatcher.py - vllm/config/compilation.py - vllm/compilation + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90a] + +- label: Kernels Core Operation Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - tests/kernels/core - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py + - pytest -v -s kernels/core kernels/test_top_k_per_row.py -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - parallelism: 2 - source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - - vllm/model_executor/layers/attention - - tests/kernels/attention - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Kernels Mamba Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 optional: true - parallelism: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - parallelism: 2 - source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba - vllm/model_executor/layers/mamba/ops + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/mamba + - pytest -v -s kernels/mamba -- label: Kernels Helion Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Kernels Helion Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/utils/import_utils.py - tests/kernels/helion/ + - vllm/platforms/rocm.py commands: - - pip install helion - - pytest -v -s kernels/helion/ + - pip install helion + - pytest -v -s kernels/helion/ -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Model Executor # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - 
vllm/engine/arg_utils.py - vllm/config/model.py - vllm/model_executor - tests/model_executor - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Benchmarks # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ + - vllm/platforms/rocm.py commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Benchmarks CLI Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/benchmarks/ commands: - pytest -v -s benchmarks/ -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - commands: - - uv pip install --system torchao==0.14.1 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: OpenAI API correctness # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - - tools/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - bash ../tools/install_torchcodec_rocm.sh || exit 1 - pytest -s entrypoints/openai/correctness/ -- label: Basic Models Tests (Initialization) # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Initialization) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - 
tests/models/registry.py commands: - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset -- label: Basic Models Tests (Extra Initialization) %N # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Extra Initialization) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/models/ - - vllm/transformers_utils/ + - vllm/model_executor/layers/ - tests/models/test_initialization.py + - tests/models/registry.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Basic Models Tests (Other) # 15min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Tests (Other) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_terratorch.py - tests/models/test_transformers.py - tests/models/test_registry.py commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py -- label: Basic Models Test (Other CPU) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Models Test (Other CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - torch_nightly: true no_gpu: true + optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/test_utils.py - tests/models/test_vision.py commands: - - pytest -v -s models/test_utils.py models/test_vision.py + - pytest -v -s models/test_utils.py models/test_vision.py -- label: Language Models Tests (Standard) # 18min - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N # 27min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Language Models Tests (Extra Standard) %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true torch_nightly: true parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py - 
tests/models/language/pooling/test_embedding.py - tests/models/language/generation/test_common.py - tests/models/language/pooling/test_classification.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB + - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Tests (Hybrid) %N # 50min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true - torch_nightly: true - parallelism: 2 - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Language Models Test (PPL) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - -- label: Language Models Test (PPL) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation_ppl_test commands: - - pytest -v -s models/language/generation_ppl_test + - pytest -v -s models/language/generation_ppl_test -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - pytest -v -s models/language/pooling -m 'not core_model' -- label: Language Models Test (MTEB) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Language Models Test (MTEB) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling_mteb_test commands: - - pytest -v -s 
models/language/pooling_mteb_test + - pytest -v -s models/language/pooling_mteb_test -- label: Multi-Modal Processor Test (CPU) # 15min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Processor (CPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal - tests/models/registry.py commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] +- label: Multi-Modal Accuracy Eval (Small Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 + optional: true working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - vllm/multimodal/ - vllm/inputs/ - vllm/v1/core/ + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 -- label: Multi-Modal Models Test (Extended Generation 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model + + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model + + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model + + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s 
models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended Generation 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Generation 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - 
pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Comm Ops # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 - source_file_dependencies: - - vllm/model_executor/layers/quantization - - tests/models/quantization - commands: - - pytest -v -s models/quantization - -- label: Transformers Nightly Models Test # 60 min - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - working_dir: "/vllm-workspace/" - optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 num_gpus: 2 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed - tests/distributed + - vllm/platforms/rocm.py commands: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_buffer.py - pytest -v -s distributed/test_shm_storage.py -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdmultinode, amdgfx90a] - agent_pool: mi250_4 - optional: true + +- label: Distributed DP Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 num_gpus: 2 - num_nodes: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/ - vllm/engine/ - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - vllm/platforms/rocm.py commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v 
-s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Compile + RPC Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_2 - optional: true num_gpus: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/compilation/ @@ -1083,40 +1200,58 @@ steps: - vllm/v1/worker/ - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - - examples/offline_inference/new_weight_syncing/ + - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py + + +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Distributed Model Tests (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
-  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
+  - vllm/model_executor/layers/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
@@ -1125,46 +1260,52 @@ steps:
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

-- label: Plugin Tests (2 GPUs) # 40min
-  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Plugin Tests (2 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
+  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
+  - vllm/platforms/rocm.py
  commands:
-  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  # BEGIN: platform plugin and general plugin tests; everything in between runs on the dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
-  # end platform plugin tests
-  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  # END: platform plugin tests
+  # BEGIN: `io_processor` plugins test; everything in between uses the `prithvi_io_processor` plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
-  # test bge_m3_sparse io_processor plugin
+  # END: `io_processor` plugins test
+  # BEGIN: `bge_m3_sparse` `io_processor` plugin test
  - pip install -e ./plugins/bge_m3_sparse_plugin
  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
  - pip uninstall bge_m3_sparse_plugin -y
-  # end io_processor plugins test
-  # begin stat_logger plugins test
+  # END: `bge_m3_sparse` `io_processor` plugin test
+  # BEGIN: `stat_logger` plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
-  # end stat_logger plugins test
-  # other tests continue here:
+  # END: `stat_logger` plugins test
+  # BEGIN: other tests
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py
  - pytest -v -s models/test_oot_registration.py
  - pytest -v -s plugins/lora_resolvers
+  # END: other tests

-- label: Pipeline + Context Parallelism Test # 45min
-  timeout_in_minutes: 60
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Pipeline + Context Parallelism (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
@@ -1173,325 +1314,130 @@ steps:
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
+  - vllm/model_executor/layers/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
  - tests/distributed/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

-- label: LoRA TP Test (Distributed) # 17 min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Ray Dependency Compatibility Check # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_1
+  working_dir: "/"
+  source_file_dependencies:
+  - requirements/
+  - setup.py
+  - vllm/platforms/rocm.py
+  commands:
+  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh
+
+
+- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
-  - vllm/lora
-  - tests/lora
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_chatglm3_tp.py
-  - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_llm_with_multi_loras.py
-  - pytest -v -s -x lora/test_olmoe_tp.py
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

-- label: Weight Loading Multiple GPU Test # 33min
-  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Distributed NixlConnector PD accuracy, DP+EP (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_4
+  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+
+- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
-  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - vllm/v1/worker/kv_connector_model_runner_mixin.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
  commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh

-- label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Distributed NixlConnector PD accuracy, CrossLayer KV layout (4 GPUs) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
+  agent_pool: mi250_4
+  num_gpus: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+  - tests/v1/kv_connector/nixl_integration/
+  - vllm/platforms/rocm.py
+  commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
+  - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+
+
+- label: Distributed Tests (2 GPUs) (H100-MI250) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
-  optional: true
-  working_dir: "/vllm-workspace/tests"
+  working_dir: "/vllm-workspace/"
  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
-  commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
-
-- label: NixlConnector PD accuracy tests (Distributed) # 30min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  num_gpus: 4
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-  - tests/v1/kv_connector/nixl_integration/
-  commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
-- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
-  timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  num_gpus: 4
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
-  - tests/v1/kv_connector/nixl_integration/
-  commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
-- label: Distributed Tests (A100) # 68min
-  timeout_in_minutes: 90
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  optional: true
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/
+  - vllm/distributed/
+  - vllm/v1/distributed/
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - tests/distributed/test_context_parallel.py
+  - tests/v1/distributed/test_dbo.py
+  - examples/offline_inference/data_parallel.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
-  - pytest -v -s distributed/test_custom_all_reduce.py
-  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-  - pytest -v -s -x lora/test_mixtral.py
-
-- label: LM Eval Large Models # 80min
-  timeout_in_minutes: 110
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
-  agent_pool: mi250_4
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
-- label: LM Eval Large Models (H100) # 80min
-  timeout_in_minutes: 110
- 
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 - -- label: Distributed Tests (H200) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_2 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace/" - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - -- label: LM Eval Small Models (1 Card) # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: LM Eval Large Models (4 Card) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -- label: ROCm LM Eval Large Models (8 Card) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_8 - num_gpus: 8 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 - -- label: ROCm GPT-OSS Eval # 80min - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - agent_pool: mi250_1 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - optional: true - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: DeepSeek V2-Lite Accuracy # 70min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 70min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] - agent_pool: mi250_4 - optional: true - 
num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - pytest -v -s tests/distributed/test_context_parallel.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - pytest -v -s tests/v1/distributed/test_dbo.py -################################################### -# # -# MI325 test definitions # -# # -################################################### +##################################################################################################################################### +# # +# gfx942 # +# # +##################################################################################################################################### -##### fast check tests ##### - -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - optional: true - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/detokenizer - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s detokenizer - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Entrypoints Integration (LLM) # 13.1m + timeout_in_minutes: 22 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking fast_check: true torch_nightly: true - source_file_dependencies: - - vllm/ - - 
tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true source_file_dependencies: - vllm/ - tests/entrypoints/llm @@ -1499,36 +1445,35 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (API Server 1) # 1h 7m + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (API Server 2) #26.9m + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - working_dir: 
"/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -1540,15 +1485,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Entrypoints Integration (Pooling) # 22.8m + timeout_in_minutes: 48 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/pooling @@ -1556,61 +1500,50 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/responses -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed Torchrun + Examples (4 GPUs) # TBD + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_torchrun_example.py + - tests/distributed/test_torchrun_example_moe.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - examples/rl/ - tests/examples/offline_inference/data_parallel.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + # rlhf examples + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py + + +- label: Distributed DP Tests (4 GPUs) # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ - tests/v1/distributed - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_utils + - vllm/platforms/rocm.py commands: - # 
Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -1618,32 +1551,37 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py + + +- label: Distributed Compile + Comm (4 GPUs) # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/rl - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node -- label: Distributed Tests (8 GPUs) # 4min + +- label: Distributed Tests (8 GPUs)(H100-MI325) # 6.4m timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_8 - optional: true - # grade: Blocking - gpu: h100 num_gpus: 8 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - examples/offline_inference/torchrun_dp_example.py @@ -1652,77 +1590,35 @@ steps: - vllm/v1/engine/llm_engine.py - vllm/v1/executor/uniproc_executor.py - vllm/v1/worker/gpu_worker.py + - 
vllm/platforms/rocm.py commands: - # test with torchrun tp=2 and dp=4 with ep - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep -- label: EPLB Algorithm Test # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Elastic EP Scaling Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + - vllm/platforms/rocm.py commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py + - pytest -v -s distributed/test_elastic_ep.py -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Engine # 11.3m + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/engine @@ -1733,347 +1629,435 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Engine (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/engine/ + - tests/v1/engine/ + - vllm/platforms/rocm.py commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
- - pytest -v -s v1/e2e - - pytest -v -s v1/engine + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: e2e Scheduling (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + + +- label: e2e Core (1 GPU) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + + +- label: Spec Decode Eagle # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" + + +- label: Spec Decode Speculators + MTP # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + + +- label: Spec Decode Ngram + Suffix # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + + +- label: Spec Decode Draft Model # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/model_loader/ + - vllm/v1/sample/ + - vllm/model_executor/layers/ + - tests/v1/e2e/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" + + +- label: V1 e2e (2 GPUs) # 7.1m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" 
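+  # Only run the spec-decode cases that need exactly 2 GPUs
+  # (selected via the -k "tensor_parallelism" filter below).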
source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/e2e commands: - # Only run tests that need exactly 2 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. - # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + +- label: V1 e2e (4 GPUs) # 52.6m + timeout_in_minutes: 106 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/e2e commands: - # Only run tests that need 4 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints -- label: V1 Test others # 42min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # split the test to avoid interference - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). 
- - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -# TODO: Add the "V1 Test attention (MI300)" test group - -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: Batch Invariance Tests (H100) # 10min - mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - timeout_in_minutes: 25 - gpu: h100 - source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - -- label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min +- label: Entrypoints V1 # 25.7m timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + + +- label: V1 Spec Decode # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/spec_decode + commands: + - pytest -v -s -m 'not slow_test' v1/spec_decode + + +- label: V1 Sample + Logits # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py + commands: + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest 
-v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + # - export HSA_NO_SCRATCH_RECLAIM=1 + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py + commands: + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py + + +- label: Acceptance Length Test (Large Models) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/mlp_speculator.py + - tests/v1/spec_decode/test_acceptance_length.py + - vllm/platforms/rocm.py + commands: + - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 + - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test + + +- label: V1 attention (H100-MI325) # 14.5m + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + commands: + - pytest -v -s v1/attention + + +- label: Batch Invariance (H100-MI325) # 5.2m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + + +- label: V1 others (CPU) # 10.4m + timeout_in_minutes: 28 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + no_gpu: true + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples # 24.5m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, 
amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - - pip install tensorizer # for tensorizer test - # for basic + - pip install tensorizer + # Basic - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - python3 basic/offline_inference/generate.py --model facebook/opt-125m - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 basic/offline_inference/classify.py - python3 basic/offline_inference/embed.py - python3 basic/offline_inference/score.py - # for multi-modal models + # Multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models + # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo + # Features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Platform Tests (CUDA) # 5.0m + timeout_in_minutes: 9 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/cuda commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers -- label: LoRA Test %N # 20min each - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] +- label: PyTorch Compilation Passes Unit Tests # TBD + timeout_in_minutes: 180 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - parallelism: 4 - -##### .buildkite/test_areas/pytorch.yaml ##### -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Compilation Passes Unit Tests - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - source_file_dependencies: - - vllm/ - - tests/compile/passes - commands: - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. - # Use `find` to launch multiple instances of pytest. - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/compile + - tests/compile/passes commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + - pytest -s -v compile/passes --ignore compile/passes/distributed -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
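+# Note on sharded steps: several kernel steps below combine `parallelism: N`
+# with `--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT`.
+# Buildkite launches N jobs for such a step and exposes BUILDKITE_PARALLEL_JOB
+# (0..N-1) and BUILDKITE_PARALLEL_JOB_COUNT to each job; the doubled `$$`
+# defers variable interpolation from pipeline-upload time to job runtime.
+# As a rough sketch, with `parallelism: 2` the first job effectively runs:
+#
+#   pytest -v -s kernels/attention --shard-id=0 --num-shards=2
+#
+# and the second job runs the complementary shard with --shard-id=1.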
-- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Kernels Core Operation Test # 26.8m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - tests/kernels/core - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py + - vllm/model_executor/layers/rotary_embedding/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py + - pytest -v -s kernels/core kernels/test_top_k_per_row.py -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Attention Test %N # 17.7m + timeout_in_minutes: 28 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/attention/ - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Quantization Test %N # 15.2m + timeout_in_minutes: 24 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels MoE Test %N # TBD + timeout_in_minutes: 19 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + parallelism: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ @@ -2082,533 +2066,301 @@ steps: - vllm/distributed/device_communicators/ - vllm/envs.py - vllm/config + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s 
kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels FP8 MoE Test - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels FP8 MoE Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ + - vllm/model_executor/layers/fused_moe/ + - tests/kernels/moe/test_deepep_moe.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/envs.py commands: - pytest -v -s kernels/moe/test_deepep_moe.py -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: ROCm AITER Ops Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py + - tests/rocm/aiter/ + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py + - vllm/v1/attention/selector.py commands: - - pytest -v -s kernels/mamba + - pytest -v -s rocm/aiter/ -- label: Kernels DeepGEMM Test (H100) # Nvidia-centric -# Not replicating for CUTLAS & CuTe - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py -- label: Kernels Helion Test - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min +- label: Benchmarks # 8.2m timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, 
amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + optional: true working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ + - vllm/platforms/rocm.py commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Quantization # 36.1m + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/quantization commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Tests (Standard) # 22.8m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - # grade: Blocking - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - - tools/ - commands: # LMEval+Transcription WER check - - bash ../tools/install_torchcodec_rocm.sh || exit 1 - - pytest -s entrypoints/openai/correctness/ - - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) 
Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Tests (Hybrid) %N # 34.9m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. 
Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB parallelism: 2 - -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Language Models Test (Extended Generation) # 32.2m + timeout_in_minutes: 55 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - 
mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test - -- label: Multi-Modal Processor Test (CPU) - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Multi-Modal Processor # 1h 42m + timeout_in_minutes: 138 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - - tests/models/registry.py - no_gpu: true - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - optional: true - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal - tests/models/registry.py commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ + - vllm/ + - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py 
--ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Multi-Modal Models (Extended Generation 1) # 1h 2m + timeout_in_minutes: 106 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Quantized Models Test # 21.4m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/models/quantization + - vllm/model_executor/model_loader/ commands: - - pytest -v -s models/quantization + - pytest -v -s models/quantization -- label: Transformers 
Nightly Models Test - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Transformers Nightly Models # 50.9m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/" optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/basic/offline_inference/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 working_dir: "/vllm-workspace/" - gpu: b200 source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/multimodal/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/models/ + - examples/ commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/basic/offline_inference/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new 
tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 +- label: Quantized MoE Test (B200-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py + - tests/quantization/test_gfx3xx_moe.py - vllm/model_executor/models/deepseek_v2.py - vllm/model_executor/models/gpt_oss.py - vllm/model_executor/models/llama4.py @@ -2616,65 +2368,49 @@ steps: - vllm/model_executor/layers/quantization/compressed_tensors - vllm/model_executor/layers/quantization/modelopt.py - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py + - pytest -s -v tests/quantization/test_gfx3xx_moe.py -##### 1 GPU test ##### -##### multi gpus test ##### -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed DP Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdmultinode] - agent_pool: mi325_4 - optional: true - # grade: Blocking working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 source_file_dependencies: - vllm/distributed/ - vllm/engine/ - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - vllm/platforms/rocm.py commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 
--nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/compilation/ - vllm/distributed/ @@ -2685,381 +2421,446 @@ steps: - vllm/v1/worker/ - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - - examples/rl/ + - vllm/platforms/rocm.py commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py + + +- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # 56.1m + timeout_in_minutes: 102 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/distributed/ + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - 
vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed Model Tests (2 GPUs) # 19.3m + timeout_in_minutes: 38 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/model_loader/sharded_state_loader.py - vllm/model_executor/models/ + - vllm/model_executor/layers/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py - tests/basic_correctness/ - tests/model_executor/model_loader/test_sharded_state_loader.py - tests/models/ commands: - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # test bge_m3_sparse io_processor plugin - - pip install -e ./plugins/bge_m3_sparse_plugin - - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py - - pip uninstall bge_m3_sparse_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - - pytest -v -s 
models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] +- label: LoRA TP (Distributed) # 9.8m + timeout_in_minutes: 18 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - optional: true - # grade: Blocking + num_gpus: 4 working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - optional: true - # grade: Blocking - num_gpus: 4 source_file_dependencies: - vllm/lora - tests/lora + - vllm/platforms/rocm.py commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - # Disabled for now because MXFP4 backend on non-cuda platform - # doesn't support LoRA yet - #- pytest -v -s -x lora/test_gptoss_tp.py + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Weight Loading Multiple GPU # 7.5m + timeout_in_minutes: 14 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Weight Loading Multiple GPU - Large Models # 12.6m + timeout_in_minutes: 26 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt -- label: NixlConnector PD accuracy tests (Distributed) # 30min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 + +- label: 
Ray Dependency Compatibility Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 optional: true - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 + working_dir: "/" source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ + - requirements/ + - setup.py + - vllm/platforms/rocm.py commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # 27.4m + timeout_in_minutes: 44 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 + num_gpus: 4 optional: true - # grade: Blocking - timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_devices: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: a100 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: Distributed Tests (4 GPUs)(A100-MI325) # 20.9m + timeout_in_minutes: 37 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 + working_dir: "/vllm-workspace/tests" 
source_file_dependencies: - vllm/ commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - export TORCH_NCCL_BLOCKING_WAIT=1 - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### FP8 test ##### -- label: LM Eval Large Models (H100) # optional, still use H100 for consistency - gpu: h100 - optional: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 - - -##### H200 test ##### -- label: Distributed Tests (H200) # optional - mirror_hardwares: [amdexperimental, amdproduction] +- label: Distributed Tests (2 GPUs)(H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 - # grade: Blocking - gpu: h200 + num_gpus: 2 optional: true working_dir: "/vllm-workspace/" - num_gpus: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/offline_inference/data_parallel.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - # TODO: this test is not supported on ROCm, there are aiter kernels for this. - # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
-  # this test is not supported on ROCm
-  # - pytest -v -s tests/v1/distributed/test_dbo.py
+  - export TORCH_NCCL_BLOCKING_WAIT=1
+  - pytest -v -s tests/distributed/test_context_parallel.py
+  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+  - pytest -v -s tests/v1/distributed/test_dbo.py
 
-##### B200 test #####
-- label: Distributed Tests (B200) # optional
-  gpu: b200
-  optional: true
+
+- label: Distributed Compile Unit Tests (2xH100-2xMI325) # 14.3m
+  timeout_in_minutes: 32
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
+  agent_pool: mi325_2
+  num_gpus: 2
   working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/model_executor/layers
+  - tests/compile/passes/distributed/
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
   commands:
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-  - pytest -v -s tests/v1/distributed/test_dbo.py
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/passes/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+  # TODO: this test is not supported on ROCm, there are aiter kernels for this.
+  # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+  # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
 
-##### E2E Eval Tests #####
-- label: LM Eval Small Models (1 Card) # 15min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental, amdproduction]
+
+- label: LM Eval Small Models # 13.3m
+  timeout_in_minutes: 23
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_1
-  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
+  - vllm/model_executor/models/
+  - vllm/model_executor/model_loader/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
-- label: LM Eval Large Models (4 Card)
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
-  # grade: Blocking
-  gpu: a100
+
+- label: LM Eval Small Models (B200-MI325) # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
+  agent_pool: mi325_2
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - vllm/model_executor/models/
+  - vllm/model_executor/model_loader/
+  - vllm/v1/attention/backends/
+  - vllm/v1/attention/selector.py
+  - vllm/_aiter_ops.py
+  - vllm/platforms/rocm.py
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt
+
+
+- label: LM Eval
Large Models (H200-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_8 optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx.txt + + +- label: LM Eval Large Models (4 GPUs)(FP8) # 24.8m + timeout_in_minutes: 42 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 + + +- label: LM Eval Large Models (4 GPUs)(A100-MI325) # 17.3m + timeout_in_minutes: 27 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: ROCm LM Eval Large Models (8 Card) - mirror_hardwares: [amdexperimental, amdproduction] + +- label: ROCm LM Eval Large Models (8 Card) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_8 optional: true num_gpus: 8 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/layernorm.py + - csrc/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/tests" - agent_pool: mi325_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt -##### EPLB Accuracy Tests ##### -- label: DeepSeek V2-Lite Accuracy - 
mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 60 - gpu: h100 +- label: GPQA Eval (GPT-OSS) (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_2 + num_gpus: 2 optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/model_executor/layers/fused_moe/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt + + +- label: DeepSeek V2-Lite Accuracy # 6.7m + timeout_in_minutes: 12 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 num_gpus: 4 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 + +- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + num_gpus: 1 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/backends/mla/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 + + +- label: Qwen3-30B-A3B-FP8-block Accuracy # 6.4m + timeout_in_minutes: 11 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_4 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 10.9m + timeout_in_minutes: 22 + mirror_hardwares: 
[amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 - # grade: Blocking - optional: true num_gpus: 4 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/spec_decode/ + - vllm/distributed/eplb + - vllm/model_executor/layers/fused_moe/ + - vllm/model_executor/layers/quantization/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 @@ -3072,11 +2873,12 @@ steps: ## TODO: Enable the test in this group # # corresponds to .buildkite/test_areas/compile.yaml -# - label: Fusion and Compile Unit Tests (2xMI325 GPUs) -# timeout_in_minutes: 20 -# working_dir: "/vllm-workspace/" -# mirror_hardwares: [amdexperimental, amdproduction, tj] +# - label: Fusion and Compile Unit Tests (2xB200-2xMI325) # TBD +# timeout_in_minutes: 180 +# mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325, tj] # agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs +# num_gpus: 1 +# working_dir: "/vllm-workspace/" # source_file_dependencies: # - csrc/quantization/fp4/ # - vllm/model_executor/layers/quantization/ @@ -3100,205 +2902,91 @@ steps: # # TODO: find out more details # # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion E2E Quick (MI325) - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - num_devices: 1 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion E2E Config Sweep (MI325) - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] +- label: Fusion E2E Quick (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - num_devices: 1 + num_gpus: 1 + working_dir: "/vllm-workspace/" source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - rocm-smi - # Run just llama3 (fp8) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" + - rocm-smi + # 
Run all models and attn backends but only Inductor partition and native custom ops + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" + + +- label: Fusion E2E Config Sweep (H100-MI325) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] + agent_pool: mi325_1 + num_gpus: 1 + working_dir: "/vllm-workspace/" + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - rocm-smi + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" ## There are no ops on ROCm for these tests. ## The test still passes but the logs are not useful. ## fused ops just call torch.ops.symm_mem which ## exists in ROCm even though they don't work -# - label: AsyncTP Correctness Tests (2xMI325 GPUs) -# - label: Fusion E2E TP2 Quick (MI325) -# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) -# - label: Fusion E2E TP2 (MI325) -# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) +# - label: AsyncTP Correctness Tests (2xH100-2xMI325) +# - label: Fusion E2E TP2 Quick (H100-MI325) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100-MI325) +# - label: Fusion E2E TP2 (B200-MI325) +# - label: Sequence Parallel Correctness Tests (2xH100-2xMI325) ##################################################################################################################################### # # -# MI355 test definitions ( currently the test set is completely mirrored // TBD which tests are to be routed there ultimately) # +# gfx950 # # # ##################################################################################################################################### -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. 
Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] +- label: Entrypoints Integration (API Server 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true fast_check: true torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - 
tests/entrypoints/llm - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true source_file_dependencies: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Entrypoints Integration (API Server 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/rpc @@ -3310,14 +2998,14 @@ steps: - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Entrypoints Integration (Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/entrypoints/pooling @@ -3325,509 +3013,195 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/pooling -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + +- label: Regression # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 optional: true working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai/responses - -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - # grade: Blocking - working_dir: 
"/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/rl/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/rl - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd - -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_8 - optional: true - gpu: h100 - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - # test with torchrun tp=2 and dp=4 with ep - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - 
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py - -- label: EPLB Execution Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing - -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 source_file_dependencies: - vllm/ - tests/test_regression commands: - pip install modelscope - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: V1 Spec Decode # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port + - tests/v1/spec_decode commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s -m 'not slow_test' v1/spec_decode -- label: V1 Test e2e + engine # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - - pytest -v -s v1/engine - -- label: V1 Test e2e (2 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # Only run tests that need exactly 2 GPUs - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - -- label: V1 Test e2e (4 GPUs) # 65min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. 
- # See discussion here: https://github.com/vllm-project/vllm/pull/31040 - agent_pool: mi355_4 - optional: true - # grade: Blocking - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # Only run tests that need 4 GPUs - - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - -- label: V1 Test others # 42min +- label: V1 Sample + Logits # TBD timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/ + - tests/v1/sample + - tests/v1/logits_processors + - tests/v1/test_oracle.py + - tests/v1/test_request.py + - tests/v1/test_outputs.py commands: - # split the test to avoid interference - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py -- label: V1 Test attention (H100) # 10min - mirror_hardwares: [amdexperimental] + +- label: V1 Core + KV + Metrics # TBD + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - timeout_in_minutes: 30 - gpu: h100 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/ + - tests/v1/core + - tests/v1/executor + - tests/v1/kv_offload + - tests/v1/worker + - tests/v1/kv_connector/unit + - tests/v1/metrics + - tests/entrypoints/openai/correctness/test_lmeval.py commands: - - pytest -v -s v1/attention + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/worker + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Batch Invariance Tests (H100) # 10min - mirror_hardwares: [amdexperimental] + +- label: V1 Speculative Decoding (slow) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] 
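+  # Counterpart of the "V1 Spec Decode" step above: the same v1/spec_decode
+  # suite is split by pytest marker, with -m 'not slow_test' running there and
+  # only the slow remainder collected here. Illustrative selection (assuming
+  # the slow cases are tagged with @pytest.mark.slow_test):
+  #   pytest -v -s -m 'not slow_test' v1/spec_decode           # fast subset
+  #   pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py   # slow subset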
agent_pool: mi355_1 - timeout_in_minutes: 25 - gpu: h100 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ + - vllm/v1/spec_decode/ + - vllm/model_executor/models/ + - vllm/v1/attention/ + - vllm/model_executor/layers/ + - tests/v1/spec_decode/ + - vllm/platforms/rocm.py commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py + - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py -- label: V1 Test attention (B200) # 10min - mirror_hardwares: [amdexperimental, amdmi355] + +- label: V1 attention (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - timeout_in_minutes: 30 - gpu: b200 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s v1/attention + - pytest -v -s v1/attention -- label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + +- label: Examples # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints - vllm/multimodal - examples/ + - vllm/platforms/rocm.py commands: - - pip install tensorizer # for tensorizer test - # for basic - - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN - - python3 basic/offline_inference/generate.py --model facebook/opt-125m - - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 basic/offline_inference/classify.py - - python3 basic/offline_inference/embed.py - - python3 basic/offline_inference/score.py - # for multi-modal models - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo - - 
python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - pip install tensorizer + # Basic + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # Multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # Pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # Features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/cuda - commands: - - pytest -v -s cuda/test_cuda_context.py - - pytest -v -s cuda/test_platform_no_cuda_init.py - -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers - -- label: LoRA Test %N # 20min each 
- timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - parallelism: 4 - -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - # grade: Blocking - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
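+# NOTE: the sharded `%N` steps below (e.g. Kernels Attention/Quantization/MoE)
+# split one pytest suite across `parallelism: N` Buildkite jobs. Each job is
+# given a 0-indexed BUILDKITE_PARALLEL_JOB and the total
+# BUILDKITE_PARALLEL_JOB_COUNT; the doubled `$$` defers interpolation from
+# pipeline upload time to the agent at runtime. Illustrative expansion for
+# shard 0 of 2 (hypothetical values, not a literal command from this file):
+#   pytest -v -s kernels/attention --shard-id=0 --num-shards=2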
- -- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels Attention Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + parallelism: 2 optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/attention/ - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention + - vllm/_aiter_ops.py + - vllm/envs.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] + +- label: Kernels Quantization Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + parallelism: 2 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization - tests/kernels/quantization + - tests/kernels/quantization/test_rocm_skinny_gemms.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/kernels/ commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels MoE Test %N # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + parallelism: 4 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ @@ -3836,1018 +3210,473 @@ steps: - vllm/distributed/device_communicators/ - vllm/envs.py - vllm/config + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT -- 
label: Kernels FP8 MoE Test - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Kernels FP8 MoE Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/moe/ + - csrc/quantization/w8a8/cutlass/moe/ + - vllm/model_executor/layers/fused_moe/ + - tests/kernels/moe/test_deepep_moe.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/envs.py commands: - pytest -v -s kernels/moe/test_deepep_moe.py -- label: Kernels Mamba Test # 31min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops - commands: - - pytest -v -s kernels/mamba -- label: Kernels DeepGEMM Test (H100) # Nvidia-centric -# Not replicating for CUTLAS & CuTe - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - -- label: Kernels Helion Test - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Quantization # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - commands: - - bash scripts/run-benchmarks.sh - -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ - -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization - tests/quantization + - 
vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] + +- label: Language Models Tests (Standard) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - - tools/ - commands: # LMEval+Transcription WER check - - bash ../tools/install_torchcodec_rocm.sh || exit 1 - - pytest -s entrypoints/openai/correctness/ - - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) 
Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. 
Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] +- label: Language Models Test (Extended Generation) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] +- label: Language Models Test (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test - -- label: Language Models Test (Extended Pooling) # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/language/pooling commands: - - pytest -v -s models/language/pooling -m 'not core_model' + - pytest -v -s models/language/pooling -m 'not core_model' -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - # grade: Blocking - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test -- label: Multi-Modal Processor Test (CPU) - 
timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] +- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - no_gpu: true - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test # 44min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing - -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 100 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py - - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model -- label: Multi-Modal Accuracy Eval (Small Models) # 5min - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ + - vllm/ + - tests/models/multimodal commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 1) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation - tests/models/multimodal/test_mapping.py commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export 
MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py - - pytest -v -s models/multimodal/test_mapping.py + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model -- label: Multi-Modal Models Test (Extended 2) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model -- label: Multi-Modal Models Test (Extended 3) # 75min - timeout_in_minutes: 150 - mirror_hardwares: [amdexperimental] + +- label: Multi-Modal Models (Extended Generation 1) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + - tests/models/multimodal/test_mapping.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py + - pytest -v -s models/multimodal/test_mapping.py + + +- label: Multi-Modal Models (Extended Generation 2) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/generation commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended Pooling) # 60min - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental] + +- label: Multi-Modal Models (Extended Generation 3) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + 
working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/models/multimodal/generation + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + + +- label: Multi-Modal Models (Extended Pooling) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - tests/models/multimodal/pooling commands: - - export MIOPEN_DEBUG_CONV_DIRECT=0 - - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -v -s models/multimodal/pooling -m 'not core_model' + - pytest -v -s models/multimodal/pooling -m 'not core_model' -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] + +- label: Quantized Models Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + - vllm/model_executor/model_loader/ commands: - - pytest -v -s models/quantization + - pytest -v -s models/quantization -- label: Transformers Nightly Models Test - mirror_hardwares: [amdexperimental] + +- label: Kernels (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 working_dir: "/vllm-workspace/" - optional: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/basic/offline_inference/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Test (MI355) # 21 min - mirror_hardwares: [amdexperimental, amdmi355] - agent_pool: mi355_1 - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - gpu: b200 - # optional: true source_file_dependencies: - csrc/quantization/fp4/ - csrc/attention/mla/ - csrc/quantization/cutlass_w8a8/moe/ - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/v1/attention/backends/triton_attn.py + - vllm/v1/attention/backends/rocm_attn.py + - vllm/v1/attention/backends/rocm_aiter_fa.py + - 
vllm/v1/attention/backends/rocm_aiter_unified_attn.py + - vllm/v1/attention/backends/mla/aiter_triton_mla.py + - vllm/v1/attention/backends/mla/rocm_aiter_mla.py - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py + - vllm/platforms/rocm.py + - vllm/_aiter_ops.py commands: - - rocm-smi - - python3 examples/basic/offline_inference/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - #- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - ## Quantization - #- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - #- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - #- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - #- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - #- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - #- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - #- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - #- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - #- pytest -v -s tests/kernels/moe/test_flashinfer.py - #- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + - rocm-smi + - python3 examples/basic/offline_inference/chat.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 + +- label: Weight Loading Multiple GPU # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + + +- label: Weight Loading Multiple GPU - Large Models # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + + +- label: Ray Dependency Compatibility Check # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_1 + optional: true + working_dir: "/" + source_file_dependencies: + - requirements/ + - setup.py + - vllm/platforms/rocm.py + commands: + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh + + +- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip 
install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + + +- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/v1/worker/kv_connector_model_runner_mixin.py + - tests/v1/kv_connector/nixl_integration/ + - vllm/platforms/rocm.py + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + + +- label: Distributed Tests (2 GPUs)(H100-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + source_file_dependencies: + - vllm/distributed/ + - vllm/v1/distributed/ + - vllm/model_executor/layers/fused_moe/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - tests/distributed/test_context_parallel.py + - tests/v1/distributed/test_dbo.py + - examples/offline_inference/data_parallel.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s tests/distributed/test_context_parallel.py + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + + +- label: Distributed Compile Unit Tests (2xH100-2xMI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py + - vllm/model_executor/layers + - tests/compile/passes/distributed/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s 
tests/compile/passes/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # TODO: this test is not supported on ROCm yet; there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # # Wrap with quotes to escape yaml # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies +- label: LM Eval Small Models (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + optional: true + working_dir: "/vllm-workspace/tests" source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt + + +- label: LM Eval Large Models (4 GPUs)(FP8) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_4 + num_gpus: 4 + optional: true + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4 + + +- label: GPQA Eval (GPT-OSS) (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - 
vllm/model_executor/layers/fused_moe/ + - tests/evals/gpt_oss/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py - - vllm/model_executor/models/deepseek_v2.py - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/models/llama4.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization/compressed_tensors - - vllm/model_executor/layers/quantization/modelopt.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - mirror_hardwares: [amdexperimental, amdproduction, amdmi355] +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt - -##### 1 GPU test ##### -##### multi gpus test ##### - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdmultinode] - agent_pool: mi355_4 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py - commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - # grade: Blocking - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/distributed/ - - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - 
vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # test bge_m3_sparse io_processor plugin - - pip install -e ./plugins/bge_m3_sparse_plugin - - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py - - pip uninstall bge_m3_sparse_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. 
- - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - # Disabled for now because MXFP4 backend on non-cuda platform - # doesn't support LoRA yet - #- pytest -v -s -x lora/test_gptoss_tp.py - - -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - -- label: NixlConnector PD accuracy tests (Distributed) # 30min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - optional: true - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - # grade: Blocking - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/tests" - num_devices: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - gpu: a100 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/ - commands: - # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 - # TODO: Remove when the bug is fixed in a future ROCm release - - export TORCH_NCCL_BLOCKING_WAIT=1 - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest 
basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - - -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - gpu: h100 - optional: true - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - - -##### H200 test ##### -- label: Distributed Tests (H200) # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi355_2 - gpu: h200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
- - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### B200 test ##### -- label: Distributed Tests (B200) # optional - gpu: b200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### E2E Eval Tests ##### -- label: LM Eval Small Models (1 Card) # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: LM Eval Large Models (4 Card) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -- label: ROCm LM Eval Large Models (8 Card) - mirror_hardwares: [amdproduction] - agent_pool: mi355_8 - optional: true - num_gpus: 8 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 - -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/tests" - agent_pool: mi355_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt - -##### EPLB Accuracy Tests ##### -- label: DeepSeek V2-Lite Accuracy - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) - mirror_hardwares: [amdexperimental, amdproduction, amdmi355] - agent_pool: mi355_2 - timeout_in_minutes: 60 - gpu: b200 - optional: true num_gpus: 2 working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/model_executor/model_loader/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/eplb + - vllm/v1/attention/backends/ + - vllm/v1/attention/selector.py + - .buildkite/scripts/scheduled_integration_test/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - bash 
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_4 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 - -- label: Attention Benchmarks Smoke Test (B200-MI355) - device: b200 - mirror_hardwares: [amdexperimental, amdmi355] +- label: Attention Benchmarks Smoke Test (B200-MI355) # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_2 num_gpus: 2 - optional: true working_dir: "/vllm-workspace/" - timeout_in_minutes: 10 source_file_dependencies: - benchmarks/attention_benchmarks/ - vllm/v1/attention/ + - vllm/_aiter_ops.py + - vllm/platforms/rocm.py commands: - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 - diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 5da7b64ac..c21b66552 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -59,7 +59,7 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - pytest -s -v tests/compile/passes/distributed -- label: Fusion and Compile Unit Tests (B200) +- label: Fusion and Compile Unit Tests (2xB200) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" device: b200 diff --git a/tests/evals/gpt_oss/configs/models-gfx942.txt b/tests/evals/gpt_oss/configs/models-gfx942.txt index 48cef0122..60eff507d 100644 --- a/tests/evals/gpt_oss/configs/models-gfx942.txt +++ b/tests/evals/gpt_oss/configs/models-gfx942.txt @@ -1,3 +1,3 @@ # GFX942 model configurations for GPQA evaluation # Tests different environment variable combinations -gpt-oss-20b-rocm-baseline.yaml \ No newline at end of file +gpt-oss-20b-rocm-baseline.yaml diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml new file mode 100644 index 000000000..0171cb4b1 --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-DP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --data-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml new file mode 100644 index 000000000..ef92f574c --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-TP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml new file mode 100644 index 000000000..8d207878d --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V3.2-DP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: 
"deepseek-ai/DeepSeek-V3.2" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --data-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml b/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml new file mode 100644 index 000000000..46853d3f5 --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-V3.2-TP_MI325.yaml @@ -0,0 +1,12 @@ +model_name: "deepseek-ai/DeepSeek-V3.2" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel + --attention-backend=TRITON_ATTN + --speculative-config '{"method":"mtp","num_speculative_tokens":3}' diff --git a/tests/evals/gsm8k/configs/models-mi355.txt b/tests/evals/gsm8k/configs/models-mi3xx-quantized.txt similarity index 100% rename from tests/evals/gsm8k/configs/models-mi355.txt rename to tests/evals/gsm8k/configs/models-mi3xx-quantized.txt diff --git a/tests/evals/gsm8k/configs/models-mi3xx.txt b/tests/evals/gsm8k/configs/models-mi3xx.txt new file mode 100644 index 000000000..6cf833b64 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-mi3xx.txt @@ -0,0 +1,4 @@ +DeepSeek-R1-TP_MI325.yaml +DeepSeek-R1-DP_MI325.yaml +DeepSeek-V3.2-TP_MI325.yaml +DeepSeek-V3.2-DP_MI325.yaml diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index c8028c0b8..7e36ea1bd 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -64,6 +64,16 @@ def test_gsm8k_correctness(config_filename): "Marlin kernels are not supported." ) + # TODO(akaratza): Enable DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms + if current_platform.is_rocm() and ( + "deepseek-ai/DeepSeek-V3.2" in eval_config["model_name"] + or "deepseek-ai/DeepSeek-R1" in eval_config["model_name"] + ): + pytest.skip( + "Skipping DeepSeek-V3.2 and DeepSeek-R1 on ROCm platforms " + "due to agent pool disk space issues and pod evictions." 
+ ) + # Parse server arguments from config (use shlex to handle quoted strings) server_args_str = eval_config.get("server_args", "") server_args = shlex.split(server_args_str) if server_args_str else [] diff --git a/tests/quantization/test_mi3xx_moe.py b/tests/quantization/test_mi3xx_moe.py new file mode 100644 index 000000000..2f8dfde68 --- /dev/null +++ b/tests/quantization/test_mi3xx_moe.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +def test_mi3xx_moe(): + print("TODO: add tests for Mi3xx MoE quantization") diff --git a/tests/rocm/aiter/test_mla_fp8_support_check.py b/tests/rocm/aiter/test_mla_fp8_support_check.py index e3dc0f8ea..28da59a1a 100644 --- a/tests/rocm/aiter/test_mla_fp8_support_check.py +++ b/tests/rocm/aiter/test_mla_fp8_support_check.py @@ -31,7 +31,7 @@ class TestAiterMlaFp8SupportCheck: # Should return False without raising with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ImportError("No module"), ): result = _check_aiter_mla_fp8_support() @@ -46,7 +46,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ModuleNotFoundError("Module not found"), ): # Should return False without raising @@ -63,7 +63,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=AttributeError("No attribute"), ): assert _check_aiter_mla_fp8_support() is False @@ -78,7 +78,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=ValueError("No signature"), ): assert _check_aiter_mla_fp8_support() is False @@ -93,7 +93,7 @@ class TestAiterMlaFp8SupportCheck: aiter_ops._AITER_MLA_SUPPORTS_FP8 = None with patch( - "vllm._aiter_ops.inspect.signature", + "inspect.signature", side_effect=TypeError("Not a callable"), ): assert _check_aiter_mla_fp8_support() is False
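Two short notes with sketches, since the diff above leans on behavior that is easy to misread. First, the new GSM8K configs feed a single-quoted JSON value through `server_args` (`--speculative-config '{"method":"mtp","num_speculative_tokens":3}'`), and `test_gsm8k_correctness.py` explicitly parses that string with `shlex` "to handle quoted strings". A minimal sketch of why, using the `server_args` from the new `DeepSeek-R1-TP_MI325.yaml` (YAML's `>-` folded scalar joins the lines with spaces; the parsing shown is plain stdlib `shlex` behavior, not vLLM-specific code):

```python
import shlex

# server_args from DeepSeek-R1-TP_MI325.yaml after YAML ">-" folding
# joins the config lines into one space-separated string.
server_args_str = (
    "--enforce-eager --max-model-len 4096 --tensor-parallel-size 8 "
    "--enable-expert-parallel --attention-backend=TRITON_ATTN "
    "--speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
)

# A naive str.split() leaks the shell quotes into the argument value,
# handing the server a malformed JSON string.
naive = server_args_str.split()
assert naive[-1] == "'{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"

# shlex.split() applies shell quoting rules: the quotes are consumed and
# the JSON arrives as a single clean argv entry.
args = shlex.split(server_args_str)
assert args[-2:] == [
    "--speculative-config",
    '{"method":"mtp","num_speculative_tokens":3}',
]
```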
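Second, on the `tests/rocm/aiter/test_mla_fp8_support_check.py` hunks: `unittest.mock.patch` resolves a dotted-string target by importing everything up to the last dot and replacing the final attribute. The old target `"vllm._aiter_ops.inspect.signature"` therefore only resolves when `inspect` is reachable as an attribute of `vllm._aiter_ops`, whereas `"inspect.signature"` patches the `inspect` module itself and is picked up by any `inspect.signature(...)` call made through the module attribute. A self-contained sketch of the pattern; `check_supports_fp8` here is a hypothetical stand-in for `_check_aiter_mla_fp8_support`, not vLLM's actual implementation:

```python
import inspect
from unittest import mock


def check_supports_fp8() -> bool:
    """Hypothetical stand-in for _check_aiter_mla_fp8_support(): probe a
    callable's signature and treat any inspection failure as "no fp8"."""

    def kernel(q, kv, use_fp8=False):  # placeholder for the aiter MLA kernel
        pass

    try:
        return "use_fp8" in inspect.signature(kernel).parameters
    except (ImportError, AttributeError, ValueError, TypeError):
        # Mirrors the error classes exercised by the test file above;
        # ModuleNotFoundError is a subclass of ImportError.
        return False


# Patching "inspect.signature" swaps the attribute on the inspect module,
# so the module-level lookup inside check_supports_fp8() hits the injected
# error and the check degrades to False instead of raising.
with mock.patch("inspect.signature", side_effect=ImportError("No module")):
    assert check_supports_fp8() is False

# Once the patch exits, the real inspect.signature is restored.
assert check_supports_fp8() is True
```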